diff options
Diffstat (limited to 'fs')
485 files changed, 33089 insertions, 9493 deletions
@@ -1608,7 +1608,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) - aio_rw_done(req, call_read_iter(file, req, &iter)); + aio_rw_done(req, file->f_op->read_iter(req, &iter)); kfree(iovec); return ret; } @@ -1639,7 +1639,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, if (S_ISREG(file_inode(file)->i_mode)) kiocb_start_write(req); req->ki_flags |= IOCB_WRITE; - aio_rw_done(req, call_write_iter(file, req, &iter)); + aio_rw_done(req, file->f_op->write_iter(req, &iter)); } kfree(iovec); return ret; diff --git a/fs/backing-file.c b/fs/backing-file.c index 740185198db3..afb557446c27 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -52,6 +52,29 @@ struct file *backing_file_open(const struct path *user_path, int flags, } EXPORT_SYMBOL_GPL(backing_file_open); +struct file *backing_tmpfile_open(const struct path *user_path, int flags, + const struct path *real_parentpath, + umode_t mode, const struct cred *cred) +{ + struct mnt_idmap *real_idmap = mnt_idmap(real_parentpath->mnt); + struct file *f; + int error; + + f = alloc_empty_backing_file(flags, cred); + if (IS_ERR(f)) + return f; + + path_get(user_path); + *backing_file_user_path(f) = *user_path; + error = vfs_tmpfile(real_idmap, real_parentpath, f, mode); + if (error) { + fput(f); + f = ERR_PTR(error); + } + return f; +} +EXPORT_SYMBOL(backing_tmpfile_open); + struct backing_aio { struct kiocb iocb; refcount_t ref; diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 5c180fdc3efb..250d6c6d3a3a 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -282,18 +282,12 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct posix_acl *acl = NULL; - struct bkey_s_c k; - int ret; retry: bch2_trans_begin(trans); - ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash, 
inode_inum(inode), &search, 0); - if (ret) - goto err; - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); + struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, + &hash, inode_inum(inode), &search, 0); + int ret = bkey_err(k); if (ret) goto err; @@ -366,7 +360,7 @@ retry: ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?: bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto btree_err; @@ -414,39 +408,30 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0); struct btree_iter iter; - struct bkey_s_c_xattr xattr; - struct bkey_i_xattr *new; struct posix_acl *acl = NULL; - struct bkey_s_c k; - int ret; - ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash_info, inum, &search, BTREE_ITER_INTENT); + struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, + &hash_info, inum, &search, BTREE_ITER_intent); + int ret = bkey_err(k); if (ret) return bch2_err_matches(ret, ENOENT) ? 
0 : ret; - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - xattr = bkey_s_c_to_xattr(k); + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); ret = PTR_ERR_OR_ZERO(acl); - if (IS_ERR_OR_NULL(acl)) + if (ret) goto err; - ret = allocate_dropping_locks_errcode(trans, - __posix_acl_chmod(&acl, _gfp, mode)); + ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode)); if (ret) goto err; - new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); - if (IS_ERR(new)) { - ret = PTR_ERR(new); + struct bkey_i_xattr *new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); + ret = PTR_ERR_OR_ZERO(new); + if (ret) goto err; - } new->k.p = iter.pos; ret = bch2_trans_update(trans, &iter, &new->k_i, 0); diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 534ba2b02bd6..346cd91f91f9 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -195,7 +195,7 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) } int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); @@ -211,7 +211,7 @@ fsck_err: } int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_alloc_unpacked u; @@ -225,7 +225,7 @@ fsck_err: } int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_alloc_unpacked u; @@ -239,7 +239,7 @@ fsck_err: } int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_alloc_v4 a = 
bkey_s_c_to_alloc_v4(k); int ret = 0; @@ -263,7 +263,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, case BCH_DATA_free: case BCH_DATA_need_gc_gens: case BCH_DATA_need_discard: - bkey_fsck_err_on(bch2_bucket_sectors(*a.v) || a.v->stripe, + bkey_fsck_err_on(bch2_bucket_sectors_total(*a.v) || a.v->stripe, c, err, alloc_key_empty_but_have_data, "empty data type free but have data"); break; @@ -330,27 +330,17 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); bch2_prt_data_type(out, a->data_type); prt_newline(out); - prt_printf(out, "journal_seq %llu", a->journal_seq); - prt_newline(out); - prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a)); - prt_newline(out); - prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a)); - prt_newline(out); - prt_printf(out, "dirty_sectors %u", a->dirty_sectors); - prt_newline(out); - prt_printf(out, "cached_sectors %u", a->cached_sectors); - prt_newline(out); - prt_printf(out, "stripe %u", a->stripe); - prt_newline(out); - prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy); - prt_newline(out); - prt_printf(out, "io_time[READ] %llu", a->io_time[READ]); - prt_newline(out); - prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]); - prt_newline(out); - prt_printf(out, "fragmentation %llu", a->fragmentation_lru); - prt_newline(out); - prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); + prt_printf(out, "journal_seq %llu\n", a->journal_seq); + prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); + prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); + prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); + prt_printf(out, "cached_sectors %u\n", a->cached_sectors); + prt_printf(out, "stripe %u\n", a->stripe); + prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); + prt_printf(out, "io_time[READ] %llu\n", 
a->io_time[READ]); + prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); + prt_printf(out, "fragmentation %llu\n", a->fragmentation_lru); + prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a)); printbuf_indent_sub(out, 2); } @@ -439,22 +429,18 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct b } struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, - struct bpos pos) +bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos) { - struct bkey_s_c k; - struct bkey_i_alloc_v4 *a; - int ret; - - k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - ret = bkey_err(k); + struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, + BTREE_ITER_with_updates| + BTREE_ITER_cached| + BTREE_ITER_intent); + int ret = bkey_err(k); if (unlikely(ret)) return ERR_PTR(ret); - a = bch2_alloc_to_v4_mut_inlined(trans, k); + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); ret = PTR_ERR_OR_ZERO(a); if (unlikely(ret)) goto err; @@ -464,6 +450,20 @@ err: return ERR_PTR(ret); } +__flatten +struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos); + int ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ERR_PTR(ret); + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + bch2_trans_iter_exit(trans, &iter); + return unlikely(ret) ? 
ERR_PTR(ret) : a; +} + static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) { *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; @@ -487,7 +487,7 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) } int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -520,7 +520,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) int ret; ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: @@ -567,29 +567,31 @@ iter_err: int bch2_alloc_read(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); + struct bch_dev *ca = NULL; int ret; down_read(&c->gc_lock); if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; if (k.k->type != KEY_TYPE_bucket_gens) continue; - const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v; - + ca = bch2_dev_iterate(c, ca, k.k->p.inode); /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: */ - if (!bch2_dev_exists2(c, k.k->p.inode)) + if (!ca) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); continue; + } - struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); + const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v; for (u64 b = max_t(u64, ca->mi.first_bucket, start); b < min_t(u64, ca->mi.nbuckets, end); @@ -599,15 +601,16 @@ int bch2_alloc_read(struct bch_fs *c) })); } else { ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + 
BTREE_ITER_prefetch, k, ({ + ca = bch2_dev_iterate(c, ca, k.k->p.inode); /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: */ - if (!bch2_dev_bucket_exists(c, k.k->p)) + if (!ca) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); continue; - - struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); + } struct bch_alloc_v4 a; *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; @@ -615,6 +618,7 @@ int bch2_alloc_read(struct bch_fs *c) })); } + bch2_dev_put(ca); bch2_trans_put(trans); up_read(&c->gc_lock); @@ -625,12 +629,12 @@ int bch2_alloc_read(struct bch_fs *c) /* Free space/discard btree: */ static int bch2_bucket_do_index(struct btree_trans *trans, + struct bch_dev *ca, struct bkey_s_c alloc_k, const struct bch_alloc_v4 *a, bool set) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); struct btree_iter iter; struct bkey_s_c old; struct bkey_i *k; @@ -667,7 +671,7 @@ static int bch2_bucket_do_index(struct btree_trans *trans, old = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(&k->k), - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = bkey_err(old); if (ret) return ret; @@ -711,8 +715,8 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, return ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES); + BTREE_ITER_intent| + BTREE_ITER_with_updates); ret = bkey_err(k); if (ret) return ret; @@ -734,26 +738,24 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, int bch2_trigger_alloc(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; int ret = 0; - if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, - "alloc key for invalid device or bucket")) + struct bch_dev *ca = 
bch2_dev_bucket_tryget(c, new.k->p); + if (!ca) return -EIO; - struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode); - struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; - new_a->data_type = alloc_data_type(*new_a, new_a->data_type); + alloc_data_type_set(new_a, new_a->data_type); - if (bch2_bucket_sectors(*new_a) > bch2_bucket_sectors(*old_a)) { + if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); @@ -770,10 +772,10 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (old_a->data_type != new_a->data_type || (new_a->data_type == BCH_DATA_free && alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { - ret = bch2_bucket_do_index(trans, old, old_a, false) ?: - bch2_bucket_do_index(trans, new.s_c, new_a, true); + ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?: + bch2_bucket_do_index(trans, ca, new.s_c, new_a, true); if (ret) - return ret; + goto err; } if (new_a->data_type == BCH_DATA_cached && @@ -787,24 +789,23 @@ int bch2_trigger_alloc(struct btree_trans *trans, bucket_to_u64(new.k->p), old_lru, new_lru); if (ret) - return ret; + goto err; } - new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, - bch_dev_bkey_exists(c, new.k->p.inode)); + new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, ca); if (old_a->fragmentation_lru != new_a->fragmentation_lru) { ret = bch2_lru_change(trans, BCH_LRU_FRAGMENTATION_START, bucket_to_u64(new.k->p), old_a->fragmentation_lru, new_a->fragmentation_lru); if (ret) - return ret; + goto err; } if (old_a->gen != new_a->gen) { ret = 
bch2_bucket_gen_update(trans, new.k->p, new_a->gen); if (ret) - return ret; + goto err; } /* @@ -812,21 +813,21 @@ int bch2_trigger_alloc(struct btree_trans *trans, * not: */ - if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && + if ((flags & BTREE_TRIGGER_bucket_invalidate) && old_a->cached_sectors) { ret = bch2_update_cached_sectors_list(trans, new.k->p.inode, -((s64) old_a->cached_sectors)); if (ret) - return ret; + goto err; } } - if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { + if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; u64 journal_seq = trans->journal_res.seq; u64 bucket_journal_seq = new_a->journal_seq; - if ((flags & BTREE_TRIGGER_INSERT) && + if ((flags & BTREE_TRIGGER_insert) && data_type_is_empty(old_a->data_type) != data_type_is_empty(new_a->data_type) && new.k->type == KEY_TYPE_alloc_v4) { @@ -854,7 +855,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (ret) { bch2_fs_fatal_error(c, "setting bucket_needs_journal_commit: %s", bch2_err_str(ret)); - return ret; + goto err; } } @@ -884,11 +885,11 @@ int bch2_trigger_alloc(struct btree_trans *trans, bch2_do_invalidates(c); if (statechange(a->data_type == BCH_DATA_need_gc_gens)) - bch2_do_gc_gens(c); + bch2_gc_gens_async(c); } - if ((flags & BTREE_TRIGGER_GC) && - (flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) { + if ((flags & BTREE_TRIGGER_gc) && + (flags & BTREE_TRIGGER_bucket_invalidate)) { struct bch_alloc_v4 new_a_convert; const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert); @@ -908,12 +909,13 @@ int bch2_trigger_alloc(struct btree_trans *trans, bucket_unlock(g); percpu_up_read(&c->mark_lock); } - - return 0; +err: + bch2_dev_put(ca); + return ret; } /* - * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for + * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for * extents style btrees, but works on non-extents btrees: 
*/ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) @@ -958,35 +960,34 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos } } -static bool next_bucket(struct bch_fs *c, struct bpos *bucket) +static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket) { - struct bch_dev *ca; + if (*ca) { + if (bucket->offset < (*ca)->mi.first_bucket) + bucket->offset = (*ca)->mi.first_bucket; - if (bch2_dev_bucket_exists(c, *bucket)) - return true; - - if (bch2_dev_exists2(c, bucket->inode)) { - ca = bch_dev_bkey_exists(c, bucket->inode); - - if (bucket->offset < ca->mi.first_bucket) { - bucket->offset = ca->mi.first_bucket; + if (bucket->offset < (*ca)->mi.nbuckets) return true; - } + bch2_dev_put(*ca); + *ca = NULL; bucket->inode++; bucket->offset = 0; } rcu_read_lock(); - ca = __bch2_next_dev_idx(c, bucket->inode, NULL); - if (ca) - *bucket = POS(ca->dev_idx, ca->mi.first_bucket); + *ca = __bch2_next_dev_idx(c, bucket->inode, NULL); + if (*ca) { + *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket); + bch2_dev_get(*ca); + } rcu_read_unlock(); - return ca != NULL; + return *ca != NULL; } -static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole) +static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, + struct bch_dev **ca, struct bkey *hole) { struct bch_fs *c = iter->trans->c; struct bkey_s_c k; @@ -995,22 +996,21 @@ again: if (bkey_err(k)) return k; + *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode); + if (!k.k->type) { - struct bpos bucket = bkey_start_pos(k.k); + struct bpos hole_start = bkey_start_pos(k.k); - if (!bch2_dev_bucket_exists(c, bucket)) { - if (!next_bucket(c, &bucket)) + if (!*ca || !bucket_valid(*ca, hole_start.offset)) { + if (!next_bucket(c, ca, &hole_start)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, bucket); + bch2_btree_iter_set_pos(iter, hole_start); goto again; 
} - if (!bch2_dev_bucket_exists(c, k.k->p)) { - struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); - - bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset); - } + if (k.k->p.offset > (*ca)->mi.nbuckets) + bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset); } return k; @@ -1025,24 +1025,25 @@ int bch2_check_alloc_key(struct btree_trans *trans, struct btree_iter *bucket_gens_iter) { struct bch_fs *c = trans->c; - struct bch_dev *ca; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; unsigned discard_key_type, freespace_key_type; unsigned gens_offset; struct bkey_s_c k; struct printbuf buf = PRINTBUF; - int ret; + int ret = 0; - if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, - alloc_key_to_missing_dev_bucket, + struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p); + if (fsck_err_on(!ca, + c, alloc_key_to_missing_dev_bucket, "alloc key for invalid device:bucket %llu:%llu", alloc_k.k->p.inode, alloc_k.k->p.offset)) - return bch2_btree_delete_at(trans, alloc_iter, 0); + ret = bch2_btree_delete_at(trans, alloc_iter, 0); + if (!ca) + return ret; - ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); if (!ca->mi.freespace_initialized) - return 0; + goto out; a = bch2_alloc_to_v4(alloc_k, &a_convert); @@ -1141,25 +1142,26 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; } +out: err: fsck_err: + bch2_dev_put(ca); printbuf_exit(&buf); return ret; } static noinline_for_stack int bch2_check_alloc_hole_freespace(struct btree_trans *trans, + struct bch_dev *ca, struct bpos start, struct bpos *end, struct btree_iter *freespace_iter) { struct bch_fs *c = trans->c; - struct bch_dev *ca; struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; - ca = bch_dev_bkey_exists(c, start.inode); if (!ca->mi.freespace_initialized) return 0; @@ -1313,7 +1315,7 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran goto delete; out: fsck_err: - set_btree_iter_dontneed(&alloc_iter); + 
bch2_set_btree_iter_dontneed(&alloc_iter); bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; @@ -1337,30 +1339,25 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_i_bucket_gens g; - struct bch_dev *ca; u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; u64 b; - bool need_update = false, dev_exists; + bool need_update = false; struct printbuf buf = PRINTBUF; int ret = 0; BUG_ON(k.k->type != KEY_TYPE_bucket_gens); bkey_reassemble(&g.k_i, k); - /* if no bch_dev, skip out whether we repair or not */ - dev_exists = bch2_dev_exists2(c, k.k->p.inode); - if (!dev_exists) { - if (fsck_err_on(!dev_exists, c, - bucket_gens_to_invalid_dev, - "bucket_gens key for invalid device:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode); + if (!ca) { + if (fsck_err(c, bucket_gens_to_invalid_dev, + "bucket_gens key for invalid device:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, 0); - } goto out; } - ca = bch_dev_bkey_exists(c, k.k->p.inode); if (fsck_err_on(end <= ca->mi.first_bucket || start >= ca->mi.nbuckets, c, bucket_gens_to_invalid_buckets, @@ -1398,6 +1395,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, } out: fsck_err: + bch2_dev_put(ca); printbuf_exit(&buf); return ret; } @@ -1406,25 +1404,26 @@ int bch2_check_alloc_info(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; + struct bch_dev *ca = NULL; struct bkey hole; struct bkey_s_c k; int ret = 0; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); 
bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); while (1) { struct bpos next; bch2_trans_begin(trans); - k = bch2_get_key_or_real_bucket_hole(&iter, &hole); + k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole); ret = bkey_err(k); if (ret) goto bkey_err; @@ -1445,7 +1444,7 @@ int bch2_check_alloc_info(struct bch_fs *c) } else { next = k.k->p; - ret = bch2_check_alloc_hole_freespace(trans, + ret = bch2_check_alloc_hole_freespace(trans, ca, bkey_start_pos(k.k), &next, &freespace_iter) ?: @@ -1473,19 +1472,21 @@ bkey_err: bch2_trans_iter_exit(trans, &freespace_iter); bch2_trans_iter_exit(trans, &discard_iter); bch2_trans_iter_exit(trans, &iter); + bch2_dev_put(ca); + ca = NULL; if (ret < 0) goto err; ret = for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, bch2_check_discard_freespace_key(trans, &iter)); if (ret) goto err; bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); while (1) { bch2_trans_begin(trans); k = bch2_btree_iter_peek(&iter); @@ -1515,7 +1516,7 @@ bkey_err: ret = for_each_btree_key_commit(trans, iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_bucket_gens_key(trans, &iter, k)); err: @@ -1562,7 +1563,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); ret = bch2_trans_update(trans, alloc_iter, - &a_mut->k_i, BTREE_TRIGGER_NORUN); + &a_mut->k_i, BTREE_TRIGGER_norun); if (ret) goto err; @@ -1601,7 +1602,7 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) { int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, - POS_MIN, 
BTREE_ITER_PREFETCH, k, + POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_alloc_to_lru_ref(trans, &iter))); bch_err_fn(c, ret); @@ -1657,9 +1658,7 @@ static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_st bch2_journal_flush_async(&c->journal, NULL); if (s->ca) - percpu_ref_put(&s->ca->ref); - if (ca) - percpu_ref_get(&ca->ref); + percpu_ref_put(&s->ca->io_ref); s->ca = ca; s->need_journal_commit_this_dev = 0; } @@ -1673,15 +1672,15 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, struct bpos pos = need_discard_iter->pos; struct btree_iter iter = { NULL }; struct bkey_s_c k; - struct bch_dev *ca; struct bkey_i_alloc_v4 *a; struct printbuf buf = PRINTBUF; bool discard_locked = false; int ret = 0; - ca = bch_dev_bkey_exists(c, pos.inode); - - if (!percpu_ref_tryget(&ca->io_ref)) { + struct bch_dev *ca = s->ca && s->ca->dev_idx == pos.inode + ? s->ca + : bch2_dev_get_ioref(c, pos.inode, WRITE); + if (!ca) { bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); return 0; } @@ -1703,7 +1702,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, need_discard_iter->pos, - BTREE_ITER_CACHED); + BTREE_ITER_cached); ret = bkey_err(k); if (ret) goto out; @@ -1713,7 +1712,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, if (ret) goto out; - if (a->v.dirty_sectors) { + if (bch2_bucket_sectors_total(a->v)) { if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, trans, "attempting to discard bucket with dirty data\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) @@ -1771,7 +1770,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - a->v.data_type = alloc_data_type(a->v, a->v.data_type); + alloc_data_type_set(&a->v, a->v.data_type); write: ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: 
bch2_trans_commit(trans, NULL, NULL, @@ -1787,7 +1786,6 @@ out: discard_in_flight_remove(c, iter.pos); s->seen++; bch2_trans_iter_exit(trans, &iter); - percpu_ref_put(&ca->io_ref); printbuf_exit(&buf); return ret; } @@ -1827,7 +1825,7 @@ void bch2_do_discards(struct bch_fs *c) static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) { struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent); struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); int ret = bkey_err(k); if (ret) @@ -1840,7 +1838,7 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo BUG_ON(a->v.dirty_sectors); SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - a->v.data_type = alloc_data_type(a->v, a->v.data_type); + alloc_data_type_set(&a->v, a->v.data_type); ret = bch2_trans_update(trans, &iter, &a->k_i, 0); err: @@ -1862,9 +1860,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work) if (i->snapshot) continue; - ca = bch_dev_bkey_exists(c, i->inode); - - if (!percpu_ref_tryget(&ca->io_ref)) { + ca = bch2_dev_get_ioref(c, i->inode, WRITE); + if (!ca) { darray_remove_item(&c->discard_buckets_in_flight, i); continue; } @@ -1903,9 +1900,12 @@ static void bch2_do_discards_fast_work(struct work_struct *work) static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket) { - struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, bucket.inode); + bool dead = !ca || percpu_ref_is_dying(&ca->io_ref); + rcu_read_unlock(); - if (!percpu_ref_is_dying(&ca->io_ref) && + if (!dead && !discard_in_flight_add(c, bucket) && bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) && !queue_work(c->write_ref_wq, &c->discard_fast_work)) @@ -1918,7 +1918,6 @@ static int invalidate_one_bucket(struct btree_trans *trans, s64 *nr_to_invalidate) { struct 
bch_fs *c = trans->c; - struct btree_iter alloc_iter = { NULL }; struct bkey_i_alloc_v4 *a = NULL; struct printbuf buf = PRINTBUF; struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); @@ -1936,7 +1935,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) return 0; - a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); + a = bch2_trans_start_alloc_update(trans, bucket); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; @@ -1961,18 +1960,15 @@ static int invalidate_one_bucket(struct btree_trans *trans, a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); - ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, - BTREE_TRIGGER_BUCKET_INVALIDATE) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc); + ret = bch2_trans_commit(trans, NULL, NULL, + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); --*nr_to_invalidate; out: - bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; err: @@ -2014,11 +2010,11 @@ static void bch2_do_invalidates_work(struct work_struct *work) ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, lru_pos(ca->dev_idx, 0, 0), lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), - BTREE_ITER_INTENT, k, + BTREE_ITER_intent, k, invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate)); if (ret < 0) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); break; } } @@ -2051,7 +2047,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)), - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); /* * Scan the alloc btree for every bucket on @ca, and add buckets to the * freespace/need_discard/need_gc_gens btrees as needed: @@ -2083,7 
+2079,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - ret = bch2_bucket_do_index(trans, k, a, true) ?: + ret = bch2_bucket_do_index(trans, ca, k, a, true) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) @@ -2155,7 +2151,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); if (ret) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); bch_err_fn(c, ret); return ret; } @@ -2182,7 +2178,10 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, u64 now; int ret = 0; - a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr)); + if (bch2_trans_relock(trans)) + bch2_trans_begin(trans); + + a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr)); ret = PTR_ERR_OR_ZERO(a); if (ret) return ret; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 2790e516383d..ae31a94be6f9 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -8,21 +8,18 @@ #include "debug.h" #include "super.h" -enum bkey_invalid_flags; +enum bch_validate_flags; /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) { - struct bch_dev *ca; - - if (!bch2_dev_exists2(c, pos.inode)) - return false; - - ca = bch_dev_bkey_exists(c, pos.inode); - return pos.offset >= ca->mi.first_bucket && - pos.offset < ca->mi.nbuckets; + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, pos.inode); + bool ret = ca && bucket_valid(ca, pos.offset); + rcu_read_unlock(); + return ret; } static inline u64 bucket_to_u64(struct bpos bucket) @@ -40,38 +37,50 @@ static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) return a.gen - a.oldest_gen; } -static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors, - u32 
cached_sectors, - u32 stripe, - struct bch_alloc_v4 a, - enum bch_data_type data_type) +static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src) { - if (stripe) - return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; - if (dirty_sectors) - return data_type; - if (cached_sectors) - return BCH_DATA_cached; - if (BCH_ALLOC_V4_NEED_DISCARD(&a)) - return BCH_DATA_need_discard; - if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) - return BCH_DATA_need_gc_gens; - return BCH_DATA_free; + dst->gen = src.gen; + dst->data_type = src.data_type; + dst->dirty_sectors = src.dirty_sectors; + dst->cached_sectors = src.cached_sectors; + dst->stripe = src.stripe; } -static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, - enum bch_data_type data_type) +static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket src) { - return __alloc_data_type(a.dirty_sectors, a.cached_sectors, - a.stripe, a, data_type); + dst->gen = src.gen; + dst->data_type = src.data_type; + dst->dirty_sectors = src.dirty_sectors; + dst->cached_sectors = src.cached_sectors; + dst->stripe = src.stripe; +} + +static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) +{ + struct bch_alloc_v4 ret = {}; + __bucket_m_to_alloc(&ret, b); + return ret; } static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type) { - return data_type == BCH_DATA_stripe ? 
BCH_DATA_user : data_type; + switch (data_type) { + case BCH_DATA_cached: + case BCH_DATA_stripe: + return BCH_DATA_user; + default: + return data_type; + } +} + +static inline bool bucket_data_type_mismatch(enum bch_data_type bucket, + enum bch_data_type ptr) +{ + return !data_type_is_empty(bucket) && + bucket_data_type(bucket) != bucket_data_type(ptr); } -static inline unsigned bch2_bucket_sectors(struct bch_alloc_v4 a) +static inline unsigned bch2_bucket_sectors_total(struct bch_alloc_v4 a) { return a.dirty_sectors + a.cached_sectors; } @@ -89,6 +98,27 @@ static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca, return d ? max(0, ca->mi.bucket_size - d) : 0; } +static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, + enum bch_data_type data_type) +{ + if (a.stripe) + return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; + if (a.dirty_sectors) + return data_type; + if (a.cached_sectors) + return BCH_DATA_cached; + if (BCH_ALLOC_V4_NEED_DISCARD(&a)) + return BCH_DATA_need_discard; + if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) + return BCH_DATA_need_gc_gens; + return BCH_DATA_free; +} + +static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_type data_type) +{ + a->data_type = alloc_data_type(*a, data_type); +} + static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) { return a.data_type == BCH_DATA_cached ? 
a.io_time[READ] : 0; @@ -147,7 +177,9 @@ static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) } struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); +bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos); +struct bkey_i_alloc_v4 * +bch2_trans_start_alloc_update(struct btree_trans *, struct bpos); void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); @@ -173,13 +205,13 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -213,7 +245,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); }) int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ @@ -233,7 +265,8 @@ static inline bool bkey_is_alloc(const struct bkey *k) int bch2_alloc_read(struct bch_fs *); int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct 
bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_do_discards(struct bch_fs *); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index a1fc30adf912..927a5f300b30 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -71,7 +71,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *c) { rcu_read_lock(); for_each_member_device_rcu(c, ca, NULL) - ca->alloc_cursor = 0; + memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor)); rcu_read_unlock(); } @@ -100,7 +100,7 @@ static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *o void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); if (ob->ec) { ec_stripe_new_put(c, ob->ec, STRIPE_REF_io); @@ -300,7 +300,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), - BTREE_ITER_CACHED); + BTREE_ITER_cached); ret = bkey_err(k); if (ret) { ob = ERR_PTR(ret); @@ -342,9 +342,9 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc struct bch_backpointer bp; struct bpos bp_pos = POS_MIN; - ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1, + ret = bch2_get_next_backpointer(trans, ca, POS(ca->dev_idx, b), -1, &bp_pos, &bp, - BTREE_ITER_NOPRESERVE); + BTREE_ITER_nopreserve); if (ret) { ob = ERR_PTR(ret); goto err; @@ -363,10 +363,10 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); if (!ob) - set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(&iter); err: if (iter.path) - set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(&iter); 
bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ob; @@ -389,7 +389,8 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct bkey_s_c k, ck; struct open_bucket *ob = NULL; u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); - u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 alloc_start = max(first_bucket, *dev_alloc_cursor); u64 alloc_cursor = alloc_start; int ret; @@ -404,9 +405,8 @@ bch2_bucket_alloc_early(struct btree_trans *trans, */ again: for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), - BTREE_ITER_SLOTS, k, ret) { - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; + BTREE_ITER_slots, k, ret) { + u64 bucket = k.k->p.offset; if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) break; @@ -415,12 +415,29 @@ again: is_superblock_bucket(ca, k.k->p.offset)) continue; - a = bch2_alloc_to_v4(k, &a_convert); + if (s->btree_bitmap != BTREE_BITMAP_ANY && + s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { + if (s->btree_bitmap == BTREE_BITMAP_YES && + bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) + break; + + bucket = sector_to_bucket(ca, + round_up(bucket_to_sector(ca, bucket) + 1, + 1ULL << ca->mi.btree_bitmap_shift)); + bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket)); + s->buckets_seen++; + s->skipped_mi_btree_bitmap++; + continue; + } + + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); if (a->data_type != BCH_DATA_free) continue; /* now check the cached key to serialize concurrent allocs of the bucket */ - ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED); + ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached); ret = bkey_err(ck); if (ret) break; @@ -433,7 +450,7 @@ again: ob = 
__try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); next: - set_btree_iter_dontneed(&citer); + bch2_set_btree_iter_dontneed(&citer); bch2_trans_iter_exit(trans, &citer); if (ob) break; @@ -441,7 +458,6 @@ next: bch2_trans_iter_exit(trans, &iter); alloc_cursor = iter.pos.offset; - ca->alloc_cursor = alloc_cursor; if (!ob && ret) ob = ERR_PTR(ret); @@ -451,6 +467,8 @@ next: goto again; } + *dev_alloc_cursor = alloc_cursor; + return ob; } @@ -463,7 +481,8 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c k; struct open_bucket *ob = NULL; - u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); u64 alloc_cursor = alloc_start; int ret; @@ -485,10 +504,30 @@ again: s->buckets_seen++; + u64 bucket = alloc_cursor & ~(~0ULL << 56); + if (s->btree_bitmap != BTREE_BITMAP_ANY && + s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { + if (s->btree_bitmap == BTREE_BITMAP_YES && + bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) + goto fail; + + bucket = sector_to_bucket(ca, + round_up(bucket_to_sector(ca, bucket) + 1, + 1ULL << ca->mi.btree_bitmap_shift)); + u64 genbits = alloc_cursor >> 56; + alloc_cursor = bucket | (genbits << 56); + + if (alloc_cursor > k.k->p.offset) + bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); + s->skipped_mi_btree_bitmap++; + continue; + } + ob = try_alloc_bucket(trans, ca, watermark, alloc_cursor, s, k, cl); if (ob) { - set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(&iter); break; } } @@ -496,10 +535,9 @@ again: if (ob || ret) break; } +fail: bch2_trans_iter_exit(trans, &iter); - ca->alloc_cursor = alloc_cursor; - if (!ob && ret) ob = ERR_PTR(ret); @@ -508,14 +546,56 @@ again: goto 
again; } + *dev_alloc_cursor = alloc_cursor; + return ob; } +static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, + enum bch_watermark watermark, + enum bch_data_type data_type, + struct closure *cl, + struct bch_dev_usage *usage, + struct bucket_alloc_state *s, + struct open_bucket *ob) +{ + struct printbuf buf = PRINTBUF; + + printbuf_tabstop_push(&buf, 24); + + prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx); + prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]); + prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]); + prt_printf(&buf, "blocking\t%u\n", cl != NULL); + prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets); + prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark)); + prt_printf(&buf, "copygc_wait\t%lu/%lli\n", + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)); + prt_printf(&buf, "seen\t%llu\n", s->buckets_seen); + prt_printf(&buf, "open\t%llu\n", s->skipped_open); + prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit); + prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow); + prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse); + prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap); + + if (!IS_ERR(ob)) { + prt_printf(&buf, "allocated\t%llu\n", ob->bucket); + trace_bucket_alloc(c, buf.buf); + } else { + prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob))); + trace_bucket_alloc_fail(c, buf.buf); + } + + printbuf_exit(&buf); +} + /** * bch2_bucket_alloc_trans - allocate a single bucket from a specific device * @trans: transaction object * @ca: device to allocate from * @watermark: how important is this allocation? + * @data_type: BCH_DATA_journal, btree, user... 
* @cl: if not NULL, closure to be used to wait if buckets not available * @usage: for secondarily also returning the current device usage * @@ -524,6 +604,7 @@ again: static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct bch_dev *ca, enum bch_watermark watermark, + enum bch_data_type data_type, struct closure *cl, struct bch_dev_usage *usage) { @@ -531,7 +612,9 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct open_bucket *ob = NULL; bool freespace = READ_ONCE(ca->mi.freespace_initialized); u64 avail; - struct bucket_alloc_state s = { 0 }; + struct bucket_alloc_state s = { + .btree_bitmap = data_type == BCH_DATA_btree, + }; bool waiting = false; again: bch2_dev_usage_read_fast(ca, usage); @@ -541,7 +624,7 @@ again: bch2_do_discards(c); if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) - bch2_do_gc_gens(c); + bch2_gc_gens_async(c); if (should_invalidate_buckets(ca, *usage)) bch2_do_invalidates(c); @@ -569,6 +652,11 @@ alloc: if (s.skipped_need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); + if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) { + s.btree_bitmap = BTREE_BITMAP_ANY; + goto alloc; + } + if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { freespace = false; goto alloc; @@ -578,33 +666,24 @@ err: ob = ERR_PTR(-BCH_ERR_no_buckets_found); if (!IS_ERR(ob)) - trace_and_count(c, bucket_alloc, ca, - bch2_watermarks[watermark], - ob->bucket, - usage->d[BCH_DATA_free].buckets, - avail, - bch2_copygc_wait_amount(c), - c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), - &s, - cl == NULL, - ""); + ob->data_type = data_type; + + if (!IS_ERR(ob)) + count_event(c, bucket_alloc); else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) - trace_and_count(c, bucket_alloc_fail, ca, - bch2_watermarks[watermark], - 0, - usage->d[BCH_DATA_free].buckets, - avail, - bch2_copygc_wait_amount(c), - c->copygc_wait - 
atomic64_read(&c->io_clock[WRITE].now), - &s, - cl == NULL, - bch2_err_str(PTR_ERR(ob))); + count_event(c, bucket_alloc_fail); + + if (!IS_ERR(ob) + ? trace_bucket_alloc_enabled() + : trace_bucket_alloc_fail_enabled()) + trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob); return ob; } struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, enum bch_watermark watermark, + enum bch_data_type data_type, struct closure *cl) { struct bch_dev_usage usage; @@ -612,7 +691,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, bch2_trans_do(c, NULL, NULL, 0, PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, - cl, &usage))); + data_type, cl, &usage))); return ob; } @@ -678,8 +757,7 @@ static int add_new_bucket(struct bch_fs *c, unsigned flags, struct open_bucket *ob) { - unsigned durability = - bch_dev_bkey_exists(c, ob->dev)->mi.durability; + unsigned durability = ob_dev(c, ob)->mi.durability; BUG_ON(*nr_effective >= nr_replicas); @@ -711,37 +789,28 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct bch_fs *c = trans->c; struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); - unsigned dev; - struct bch_dev *ca; int ret = -BCH_ERR_insufficient_devices; - unsigned i; BUG_ON(*nr_effective >= nr_replicas); - for (i = 0; i < devs_sorted.nr; i++) { + for (unsigned i = 0; i < devs_sorted.nr; i++) { struct bch_dev_usage usage; struct open_bucket *ob; - dev = devs_sorted.devs[i]; - - rcu_read_lock(); - ca = rcu_dereference(c->devs[dev]); - if (ca) - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - + unsigned dev = devs_sorted.devs[i]; + struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); if (!ca) continue; if (!ca->mi.durability && *have_cache) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); continue; } - ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage); + ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, cl, &usage); if 
(!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); if (IS_ERR(ob)) { ret = PTR_ERR(ob); @@ -750,8 +819,6 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - ob->data_type = data_type; - if (add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, have_cache, flags, ob)) { @@ -836,7 +903,7 @@ static bool want_bucket(struct bch_fs *c, bool *have_cache, bool ec, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); if (!test_bit(ob->dev, devs_may_alloc->d)) return false; @@ -906,7 +973,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); struct bch_dev_usage usage; u64 avail; @@ -1291,7 +1358,7 @@ deallocate_extra_replicas(struct bch_fs *c, unsigned i; open_bucket_for_each(c, ptrs, ob, i) { - unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability; + unsigned d = ob_dev(c, ob)->mi.durability; if (d && d <= extra_replicas) { extra_replicas -= d; @@ -1342,6 +1409,10 @@ retry: *wp_ret = wp = writepoint_find(trans, write_point.v); + ret = bch2_trans_relock(trans); + if (ret) + goto err; + /* metadata may not allocate on cache devices: */ if (wp->data_type != BCH_DATA_user) have_cache = true; @@ -1444,7 +1515,7 @@ err: struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); return (struct bch_extent_ptr) { .type = 1 << BCH_EXTENT_ENTRY_ptr, @@ -1520,7 +1591,7 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, 
ob->dev); + struct bch_dev *ca = ob_dev(c, ob); unsigned data_type = ob->data_type; barrier(); /* READ_ONCE() doesn't work on bitfields */ @@ -1622,3 +1693,104 @@ void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) prt_str(out, "Btree write point\n"); bch2_write_point_to_text(out, c, &c->btree_write_point); } + +void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) +{ + unsigned nr[BCH_DATA_NR]; + + memset(nr, 0, sizeof(nr)); + + for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) + nr[c->open_buckets[i].data_type]++; + + printbuf_tabstop_push(out, 24); + + percpu_down_read(&c->mark_lock); + prt_printf(out, "hidden\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.hidden)); + prt_printf(out, "btree\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.btree)); + prt_printf(out, "data\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.data)); + prt_printf(out, "cached\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.cached)); + prt_printf(out, "reserved\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.reserved)); + prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved)); + prt_printf(out, "nr_inodes\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes)); + percpu_up_read(&c->mark_lock); + + prt_newline(out); + prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty"); + prt_printf(out, "open buckets allocated\t%i\n", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free); + prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT); + prt_printf(out, "open_buckets_wait\t%s\n", c->open_buckets_wait.list.first ? 
"waiting" : "empty"); + prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]); + prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]); + prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr); +} + +void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bch_dev_usage stats = bch2_dev_usage_read(ca); + unsigned nr[BCH_DATA_NR]; + + memset(nr, 0, sizeof(nr)); + + for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) + nr[c->open_buckets[i].data_type]++; + + printbuf_tabstop_push(out, 12); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + + bch2_dev_usage_to_text(out, &stats); + + prt_newline(out); + + prt_printf(out, "reserves:\n"); + for (unsigned i = 0; i < BCH_WATERMARK_NR; i++) + prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i)); + + prt_newline(out); + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 12); + printbuf_tabstop_push(out, 16); + + prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets); + prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats)); +} + +void bch2_print_allocator_stuck(struct bch_fs *c) +{ + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "Allocator stuck? 
Waited for 10 seconds\n"); + + prt_printf(&buf, "Allocator debug:\n"); + printbuf_indent_add(&buf, 2); + bch2_fs_alloc_debug_to_text(&buf, c); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + + for_each_online_member(c, ca) { + prt_printf(&buf, "Dev %u:\n", ca->dev_idx); + printbuf_indent_add(&buf, 2); + bch2_dev_alloc_debug_to_text(&buf, ca); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + } + + prt_printf(&buf, "Copygc debug:\n"); + printbuf_indent_add(&buf, 2); + bch2_copygc_wait_to_text(&buf, c); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + + prt_printf(&buf, "Journal debug:\n"); + printbuf_indent_add(&buf, 2); + bch2_journal_debug_to_text(&buf, &c->journal); + printbuf_indent_sub(&buf, 2); + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 7aaeec44c746..a42c9730d32a 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -30,8 +30,14 @@ void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); long bch2_bucket_alloc_new_fs(struct bch_dev *); +static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) +{ + return bch2_dev_have_ref(c, ob->dev); +} + struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, - enum bch_watermark, struct closure *); + enum bch_watermark, enum bch_data_type, + struct closure *); static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, struct open_bucket *ob) @@ -184,7 +190,7 @@ bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp, wp->sectors_allocated += sectors; open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); ptr.cached = cached || @@ -221,4 +227,9 @@ void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); void 
bch2_write_points_to_text(struct printbuf *, struct bch_fs *); +void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *); +void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *); + +void bch2_print_allocator_stuck(struct bch_fs *); + #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index c2226e947c41..9bbb28e90b93 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -9,11 +9,18 @@ #include "fifo.h" struct bucket_alloc_state { + enum { + BTREE_BITMAP_NO, + BTREE_BITMAP_YES, + BTREE_BITMAP_ANY, + } btree_bitmap; + u64 buckets_seen; u64 skipped_open; u64 skipped_need_journal_commit; u64 skipped_nocow; u64 skipped_nouse; + u64 skipped_mi_btree_bitmap; }; #define BCH_WATERMARKS() \ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index af7a71de1bdf..692b1c7d5018 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -23,6 +23,7 @@ static bool extent_matches_bp(struct bch_fs *c, const union bch_extent_entry *entry; struct extent_ptr_decoded p; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { struct bpos bucket2; struct bch_backpointer bp2; @@ -30,31 +31,43 @@ static bool extent_matches_bp(struct bch_fs *c, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bucket2, &bp2); + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + if (!ca) + continue; + + bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2); if (bpos_eq(bucket, bucket2) && - !memcmp(&bp, &bp2, sizeof(bp))) + !memcmp(&bp, &bp2, sizeof(bp))) { + rcu_read_unlock(); return true; + } } + rcu_read_unlock(); return false; } int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - /* these will be caught by fsck */ - if 
(!bch2_dev_exists2(c, bp.k->p.inode)) + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, bp.k->p.inode); + if (!ca) { + /* these will be caught by fsck */ + rcu_read_unlock(); return 0; + } - struct bch_dev *ca = bch_dev_bkey_exists(c, bp.k->p.inode); - struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); + struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p); + struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset); + rcu_read_unlock(); int ret = 0; bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || - !bpos_eq(bp.k->p, bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset)), + !bpos_eq(bp.k->p, bp_pos), c, err, backpointer_bucket_offset_wrong, "backpointer bucket_offset wrong"); @@ -75,10 +88,16 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - if (bch2_dev_exists2(c, k.k->p.inode)) { + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, k.k->p.inode); + if (ca) { + struct bpos bucket = bp_pos_to_bucket(ca, k.k->p); + rcu_read_unlock(); prt_str(out, "bucket="); - bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p)); + bch2_bpos_to_text(out, bucket); prt_str(out, " "); + } else { + rcu_read_unlock(); } bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); @@ -117,8 +136,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bch_err(c, "%s", buf.buf); } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { - prt_printf(&buf, "backpointer not found when deleting"); - prt_newline(&buf); + prt_printf(&buf, "backpointer not found when deleting\n"); printbuf_indent_add(&buf, 2); prt_printf(&buf, "searching for "); @@ -145,6 +163,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, + struct bch_dev *ca, struct bpos 
bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, @@ -161,7 +180,7 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, return ret; bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + bp_k->k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); bp_k->v = bp; if (!insert) { @@ -171,9 +190,9 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp_k->k.p, - BTREE_ITER_INTENT| - BTREE_ITER_SLOTS| - BTREE_ITER_WITH_UPDATES); + BTREE_ITER_intent| + BTREE_ITER_slots| + BTREE_ITER_with_updates); ret = bkey_err(k); if (ret) goto err; @@ -197,13 +216,13 @@ err: * Find the next backpointer >= *bp_offset: */ int bch2_get_next_backpointer(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket, int gen, struct bpos *bp_pos, struct bch_backpointer *bp, unsigned iter_flags) { - struct bch_fs *c = trans->c; - struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); + struct bpos bp_end_pos = bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0); struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL }; struct bkey_s_c k; int ret = 0; @@ -213,7 +232,7 @@ int bch2_get_next_backpointer(struct btree_trans *trans, if (gen >= 0) { k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, - bucket, BTREE_ITER_CACHED|iter_flags); + bucket, BTREE_ITER_cached|iter_flags); ret = bkey_err(k); if (ret) goto out; @@ -223,7 +242,7 @@ int bch2_get_next_backpointer(struct btree_trans *trans, goto done; } - *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0)); + *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(ca, bucket, 0)); for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, *bp_pos, iter_flags, k, ret) { @@ -249,7 +268,6 @@ static void backpointer_not_found(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - struct bpos bucket = 
bp_pos_to_bucket(c, bp_pos); /* * If we're using the btree write buffer, the backpointer we were @@ -259,6 +277,10 @@ static void backpointer_not_found(struct btree_trans *trans, if (likely(!bch2_backpointers_no_use_write_buffer)) return; + struct bpos bucket; + if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + return; + prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", bp.level ? "btree node" : "extent"); prt_printf(&buf, "bucket: "); @@ -288,15 +310,17 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, { if (likely(!bp.level)) { struct bch_fs *c = trans->c; - struct bpos bucket = bp_pos_to_bucket(c, bp_pos); - struct bkey_s_c k; + + struct bpos bucket; + if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + return bkey_s_c_err(-EIO); bch2_trans_node_iter_init(trans, iter, bp.btree_id, bp.pos, 0, 0, iter_flags); - k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); if (bkey_err(k)) { bch2_trans_iter_exit(trans, iter); return k; @@ -325,18 +349,20 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, struct bch_backpointer bp) { struct bch_fs *c = trans->c; - struct bpos bucket = bp_pos_to_bucket(c, bp_pos); - struct btree *b; BUG_ON(!bp.level); + struct bpos bucket; + if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + return ERR_PTR(-EIO); + bch2_trans_node_iter_init(trans, iter, bp.btree_id, bp.pos, 0, bp.level - 1, 0); - b = bch2_btree_iter_peek_node(iter); + struct btree *b = bch2_btree_iter_peek_node(iter); if (IS_ERR_OR_NULL(b)) goto err; @@ -367,16 +393,16 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ struct printbuf buf = PRINTBUF; int ret = 0; - if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, - backpointer_to_missing_device, - "backpointer for missing device:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, bp_iter, 0); + struct bpos bucket; + if 
(!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { + if (fsck_err(c, backpointer_to_missing_device, + "backpointer for missing device:\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, bp_iter, 0); goto out; } - alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, - bp_pos_to_bucket(c, k.k->p), 0); + alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0); ret = bkey_err(alloc_k); if (ret) goto out; @@ -460,8 +486,8 @@ found: bytes = p.crc.compressed_size << 9; - struct bch_dev *ca = bch_dev_bkey_exists(c, dev); - if (!bch2_dev_get_ioref(ca, READ)) + struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ); + if (!ca) return false; data_buf = kvmalloc(bytes, GFP_KERNEL); @@ -511,25 +537,27 @@ static int check_bp_exists(struct btree_trans *trans, struct printbuf buf = PRINTBUF; struct bkey_s_c bp_k; struct bkey_buf tmp; - int ret; + int ret = 0; bch2_bkey_buf_init(&tmp); - if (!bch2_dev_bucket_exists(c, bucket)) { + struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket); + if (!ca) { prt_str(&buf, "extent for nonexistent device:bucket "); bch2_bpos_to_text(&buf, bucket); prt_str(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, orig_k); bch_err(c, "%s", buf.buf); - return -BCH_ERR_fsck_repair_unimplemented; + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; } if (bpos_lt(bucket, s->bucket_start) || bpos_gt(bucket, s->bucket_end)) - return 0; + goto out; bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(c, bucket, bp.bucket_offset), + bucket_pos_to_bp(ca, bucket, bp.bucket_offset), 0); ret = bkey_err(bp_k); if (ret) @@ -562,6 +590,7 @@ fsck_err: bch2_trans_iter_exit(trans, &other_extent_iter); bch2_trans_iter_exit(trans, &bp_iter); bch2_bkey_buf_exit(&tmp, c); + bch2_dev_put(ca); printbuf_exit(&buf); return ret; check_existing_bp: @@ -637,13 +666,13 @@ missing: struct bkey_i_backpointer n_bp_k; bkey_backpointer_init(&n_bp_k.k_i); - n_bp_k.k.p = 
bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + n_bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); n_bp_k.v = bp; prt_printf(&buf, "\n want: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i)); if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) - ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); + ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true); goto out; } @@ -667,7 +696,14 @@ static int check_extent_to_backpointers(struct btree_trans *trans, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bucket_pos, &bp); + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + if (ca) + bch2_extent_ptr_to_bp(c, ca, btree, level, k, p, entry, &bucket_pos, &bp); + rcu_read_unlock(); + + if (!ca) + continue; ret = check_bp_exists(trans, s, bucket_pos, bp, k); if (ret) @@ -760,7 +796,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, __for_each_btree_node(trans, iter, btree, btree == start.btree ? 
start.pos : POS_MIN, - 0, depth, BTREE_ITER_PREFETCH, b, ret) { + 0, depth, BTREE_ITER_prefetch, b, ret) { mem_may_pin -= btree_buf_bytes(b); if (mem_may_pin <= 0) { c->btree_cache.pinned_nodes_end = *end = @@ -794,31 +830,13 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, while (level >= depth) { struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, - level, - BTREE_ITER_PREFETCH); - while (1) { - bch2_trans_begin(trans); - - struct bkey_s_c k = bch2_btree_iter_peek(&iter); - if (!k.k) - break; - ret = bkey_err(k) ?: - check_extent_to_backpointers(trans, s, btree_id, level, k) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - ret = 0; - continue; - } - if (ret) - break; - if (bpos_eq(iter.pos, SPOS_MAX)) - break; - bch2_btree_iter_advance(&iter); - } - bch2_trans_iter_exit(trans, &iter); + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level, + BTREE_ITER_prefetch); + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + check_extent_to_backpointers(trans, s, btree_id, level, k) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + })); if (ret) return ret; @@ -936,7 +954,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bpos last_flushed_pos = SPOS_MAX; return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, - POS_MIN, BTREE_ITER_PREFETCH, k, + POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_one_backpointer(trans, start, end, bkey_s_c_to_backpointer(k), diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index c1b274eadda1..6021de1c5e98 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -6,6 +6,7 @@ #include "btree_iter.h" #include "btree_update.h" #include "buckets.h" +#include "error.h" #include "super.h" static inline u64 swab40(u64 x) @@ -18,7 +19,7 
@@ static inline u64 swab40(u64 x) } int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); @@ -36,15 +37,29 @@ void bch2_backpointer_swab(struct bkey_s); * Convert from pos in backpointer btree to pos of corresponding bucket in alloc * btree: */ -static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c, - struct bpos bp_pos) +static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos bp_pos) { - struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode); u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); } +static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) +{ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, bp_pos.inode); + if (ca) + *bucket = bp_pos_to_bucket(ca, bp_pos); + rcu_read_unlock(); + return ca != NULL; +} + +static inline bool bp_pos_to_bucket_nodev(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) +{ + return !bch2_fs_inconsistent_on(!bp_pos_to_bucket_nodev_noerror(c, bp_pos, bucket), + c, "backpointer for missing device %llu", bp_pos.inode); +} + static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca, struct bpos bucket, u64 bucket_offset) @@ -57,32 +72,32 @@ static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca, /* * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: */ -static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, +static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca, struct bpos bucket, u64 bucket_offset) { - struct bch_dev *ca = 
bch_dev_bkey_exists(c, bucket.inode); struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset); - EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret))); + EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, ret))); return ret; } -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos bucket, - struct bch_backpointer, struct bkey_s_c, bool); +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bch_dev *, + struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool); static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, bool insert) { if (unlikely(bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert); + return bch2_bucket_backpointer_mod_nowritebuffer(trans, ca, bucket, bp, orig_k, insert); struct bkey_i_backpointer bp_k; bkey_backpointer_init(&bp_k.k_i); - bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); bp_k.v = bp; if (!insert) { @@ -120,7 +135,7 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, } } -static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, +static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, @@ -130,7 +145,7 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, s64 sectors = level ? 
btree_sectors(c) : k.k->size; u32 bucket_offset; - *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); + *bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset); *bp = (struct bch_backpointer) { .btree_id = btree_id, .level = level, @@ -142,7 +157,7 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, }; } -int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, +int bch2_get_next_backpointer(struct btree_trans *, struct bch_dev *ca, struct bpos, int, struct bpos *, struct bch_backpointer *, unsigned); struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, struct bpos, struct bch_backpointer, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 91c3c1fef233..bc0ea2c4efef 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -359,6 +359,8 @@ do { \ #define BCH_DEBUG_PARAMS_ALWAYS() \ BCH_DEBUG_PARAM(key_merging_disabled, \ "Disables merging of extents") \ + BCH_DEBUG_PARAM(btree_node_merging_disabled, \ + "Disables merging of btree nodes") \ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ "Causes mark and sweep to compact and rewrite every " \ "btree node it traverses") \ @@ -468,6 +470,7 @@ enum bch_time_stats { #include "quota_types.h" #include "rebalance_types.h" #include "replicas_types.h" +#include "sb-members_types.h" #include "subvolume_types.h" #include "super_types.h" #include "thread_with_file_types.h" @@ -516,8 +519,8 @@ enum gc_phase { struct gc_pos { enum gc_phase phase; + u16 level; struct bpos pos; - unsigned level; }; struct reflink_gc { @@ -534,7 +537,13 @@ struct io_count { struct bch_dev { struct kobject kobj; +#ifdef CONFIG_BCACHEFS_DEBUG + atomic_long_t ref; + bool dying; + unsigned long last_put; +#else struct percpu_ref ref; +#endif struct completion ref_completion; struct percpu_ref io_ref; struct completion io_ref_completion; @@ -560,14 +569,11 @@ struct bch_dev { struct bch_devs_mask self; - /* biosets used in cloned bios for writing multiple 
replicas */ - struct bio_set replica_set; - /* * Buckets: * Per-bucket arrays are protected by c->mark_lock, bucket_lock and * gc_lock, for device resize - holding any is sufficient for access: - * Or rcu_read_lock(), but only for ptr_stale(): + * Or rcu_read_lock(), but only for dev_ptr_stale(): */ struct bucket_array __rcu *buckets_gc; struct bucket_gens __rcu *bucket_gens; @@ -581,7 +587,7 @@ struct bch_dev { /* Allocator: */ u64 new_fs_bucket_idx; - u64 alloc_cursor; + u64 alloc_cursor[3]; unsigned nr_open_buckets; unsigned nr_btree_reserve; @@ -627,12 +633,12 @@ struct bch_dev { x(clean_shutdown) \ x(fsck_running) \ x(initial_gc_unfixed) \ - x(need_another_gc) \ x(need_delete_dead_snapshots) \ x(error) \ x(topology_error) \ x(errors_fixed) \ - x(errors_not_fixed) + x(errors_not_fixed) \ + x(no_invalid_checks) enum bch_fs_flags { #define x(n) BCH_FS_##n, @@ -715,6 +721,7 @@ struct btree_trans_buf { x(discard_fast) \ x(invalidate) \ x(delete_dead_snapshots) \ + x(gc_gens) \ x(snapshot_delete_pagecache) \ x(sysfs) \ x(btree_write_buffer) @@ -926,7 +933,6 @@ struct bch_fs { /* JOURNAL SEQ BLACKLIST */ struct journal_seq_blacklist_table * journal_seq_blacklist_table; - struct work_struct journal_seq_blacklist_gc_work; /* ALLOCATOR */ spinlock_t freelist_lock; @@ -957,8 +963,7 @@ struct bch_fs { struct work_struct discard_fast_work; /* GARBAGE COLLECTION */ - struct task_struct *gc_thread; - atomic_t kick_gc; + struct work_struct gc_gens_work; unsigned long gc_count; enum btree_id gc_gens_btree; @@ -988,6 +993,7 @@ struct bch_fs { struct bio_set bio_read; struct bio_set bio_read_split; struct bio_set bio_write; + struct bio_set replica_set; struct mutex bio_bounce_pages_lock; mempool_t bio_bounce_pages; struct bucket_nocow_lock_table @@ -1115,7 +1121,6 @@ struct bch_fs { u64 counters_on_mount[BCH_COUNTER_NR]; u64 __percpu *counters; - unsigned btree_gc_periodic:1; unsigned copy_gc_enabled:1; bool promote_whole_extents; @@ -1250,11 +1255,6 @@ static inline s64 
bch2_current_time(const struct bch_fs *c) return timespec_to_bch2_time(c, now); } -static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) -{ - return dev < c->sb.nr_devices && c->devs[dev]; -} - static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) { struct stdio_redirect *stdio = c->stdio; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 2e8b1a489c20..1bebba881d89 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -76,6 +76,7 @@ #include <asm/byteorder.h> #include <linux/kernel.h> #include <linux/uuid.h> +#include <uapi/linux/magic.h> #include "vstructs.h" #ifdef __KERNEL__ @@ -589,6 +590,13 @@ struct bch_member { __le64 errors_reset_time; __le64 seq; __le64 btree_allocated_bitmap; + /* + * On recovery from a clean shutdown we don't normally read the journal, + * but we still want to resume writing from where we left off so we + * don't overwrite more than is necessary, for list journal debugging: + */ + __le32 last_journal_bucket; + __le32 last_journal_bucket_offset; }; /* @@ -1283,7 +1291,7 @@ enum bch_compression_opts { UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \ 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) -#define BCACHEFS_STATFS_MAGIC 0xca451a4e +#define BCACHEFS_STATFS_MAGIC BCACHEFS_SUPER_MAGIC #define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) #define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 76e79a15ba08..f46978e5cb7c 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -640,7 +640,7 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) int bch2_bkey_format_invalid(struct bch_fs *c, struct bkey_format *f, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { unsigned i, bits = KEY_PACKED_BITS_START; @@ -656,20 +656,17 @@ int bch2_bkey_format_invalid(struct bch_fs *c, * unpacked format: */ for (i = 0; i < f->nr_fields; 
i++) { - if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) { + if ((!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) && + bch2_bkey_format_field_overflows(f, i)) { unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); u64 packed_max = f->bits_per_field[i] ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) : 0; - u64 field_offset = le64_to_cpu(f->field_offset[i]); - if (packed_max + field_offset < packed_max || - packed_max + field_offset > unpacked_max) { - prt_printf(err, "field %u too large: %llu + %llu > %llu", - i, packed_max, field_offset, unpacked_max); - return -BCH_ERR_invalid; - } + prt_printf(err, "field %u too large: %llu + %llu > %llu", + i, packed_max, le64_to_cpu(f->field_offset[i]), unpacked_max); + return -BCH_ERR_invalid; } bits += f->bits_per_field[i]; diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 3a45d128f608..fcd43915df07 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -9,10 +9,10 @@ #include "util.h" #include "vstructs.h" -enum bkey_invalid_flags { - BKEY_INVALID_WRITE = (1U << 0), - BKEY_INVALID_COMMIT = (1U << 1), - BKEY_INVALID_JOURNAL = (1U << 2), +enum bch_validate_flags { + BCH_VALIDATE_write = (1U << 0), + BCH_VALIDATE_commit = (1U << 1), + BCH_VALIDATE_journal = (1U << 2), }; #if 0 @@ -574,8 +574,31 @@ static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const s void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); + +static inline bool bch2_bkey_format_field_overflows(struct bkey_format *f, unsigned i) +{ + unsigned f_bits = f->bits_per_field[i]; + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (f_bits > unpacked_bits) + return true; + + if ((f_bits == 
unpacked_bits) && field_offset) + return true; + + u64 f_mask = f_bits + ? ~((~0ULL << (f_bits - 1)) << 1) + : 0; + + if (((field_offset + f_mask) & unpacked_mask) < field_offset) + return true; + return false; +} + int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *); #endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index a275a9e8e341..c2c3dae52186 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -27,7 +27,7 @@ const char * const bch2_bkey_types[] = { }; static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { return 0; } @@ -41,7 +41,7 @@ static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k, }) static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -58,7 +58,7 @@ fsck_err: }) static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { return 0; } @@ -82,7 +82,7 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, }) static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { return 0; } @@ -123,9 +123,12 @@ const struct bkey_ops bch2_bkey_null_ops = { }; int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { + if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) 
+ return 0; + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); int ret = 0; @@ -159,9 +162,12 @@ const char *bch2_btree_node_type_str(enum btree_node_type type) int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { + if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) + return 0; + int ret = 0; bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err, @@ -172,7 +178,7 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, return 0; bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && - (type == BKEY_TYPE_btree || (flags & BKEY_INVALID_COMMIT)) && + (type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) && !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", @@ -224,7 +230,7 @@ fsck_err: int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { return __bch2_bkey_invalid(c, k, type, flags, err) ?: diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 03efe8ee565a..726ef7483763 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -22,14 +22,15 @@ extern const struct bkey_ops bch2_bkey_null_ops; */ struct bkey_ops { int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err); + enum bch_validate_flags flags, struct printbuf *err); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); bool (*key_normalize)(struct bch_fs *, struct bkey_s); bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); int (*trigger)(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); void (*compat)(enum 
btree_id id, unsigned version, unsigned big_endian, int write, struct bkey_s); @@ -48,11 +49,11 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) } int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c, struct printbuf *); @@ -76,56 +77,10 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); -enum btree_update_flags { - __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END, - __BTREE_UPDATE_NOJOURNAL, - __BTREE_UPDATE_KEY_CACHE_RECLAIM, - - __BTREE_TRIGGER_NORUN, - __BTREE_TRIGGER_TRANSACTIONAL, - __BTREE_TRIGGER_ATOMIC, - __BTREE_TRIGGER_GC, - __BTREE_TRIGGER_INSERT, - __BTREE_TRIGGER_OVERWRITE, - __BTREE_TRIGGER_BUCKET_INVALIDATE, -}; - -#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) -#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) -#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) - -/* Don't run triggers at all */ -#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) - -/* - * If set, we're running transactional triggers as part of a transaction commit: - * triggers may generate new updates - * - * If cleared, and either BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE are set, - * we're running atomic triggers during a transaction commit: we have our - * journal reservation, we're holding btree node write locks, and 
we know the - * transaction is going to commit (returning an error here is a fatal error, - * causing us to go emergency read-only) - */ -#define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL) -#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC) - -/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */ -#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) - -/* @new is entering the btree */ -#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) - -/* @old is leaving the btree */ -#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) - -/* signal from bucket invalidate path to alloc trigger */ -#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) - static inline int bch2_key_trigger(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); @@ -135,8 +90,9 @@ static inline int bch2_key_trigger(struct btree_trans *trans, } static inline int bch2_key_trigger_old(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, unsigned flags) + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + enum btree_iter_update_trigger_flags flags) { struct bkey_i deleted; @@ -144,12 +100,13 @@ static inline int bch2_key_trigger_old(struct btree_trans *trans, deleted.k.p = old.k->p; return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted), - BTREE_TRIGGER_OVERWRITE|flags); + BTREE_TRIGGER_overwrite|flags); } static inline int bch2_key_trigger_new(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s new, unsigned flags) + enum btree_id btree_id, unsigned level, + struct bkey_s new, + enum btree_iter_update_trigger_flags flags) { struct bkey_i deleted; @@ -157,7 +114,7 @@ static inline int bch2_key_trigger_new(struct 
btree_trans *trans, deleted.k.p = new.k->p; return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, - BTREE_TRIGGER_INSERT|flags); + BTREE_TRIGGER_insert|flags); } void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index bcca9e76a0b4..4536eb50fc40 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -6,9 +6,9 @@ #include "bset.h" #include "extents.h" -typedef int (*sort_cmp_fn)(struct btree *, - struct bkey_packed *, - struct bkey_packed *); +typedef int (*sort_cmp_fn)(const struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); static inline bool sort_iter_end(struct sort_iter *iter) { @@ -70,9 +70,9 @@ static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, /* * If keys compare equal, compare by pointer order: */ -static inline int key_sort_fix_overlapping_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) +static inline int key_sort_fix_overlapping_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) { return bch2_bkey_cmp_packed(b, l, r) ?: cmp_int((unsigned long) l, (unsigned long) r); @@ -154,46 +154,59 @@ bch2_sort_repack(struct bset *dst, struct btree *src, return nr; } -static inline int sort_keys_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) +static inline int keep_unwritten_whiteouts_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) { return bch2_bkey_cmp_packed_inlined(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: - (int) l->needs_whiteout - (int) r->needs_whiteout; + (long) l - (long) r; } -unsigned bch2_sort_keys(struct bkey_packed *dst, - struct sort_iter *iter, - bool filter_whiteouts) +#include "btree_update_interior.h" + +/* + * For sorting in the btree node write path: whiteouts not in the unwritten + * whiteouts area are dropped, whiteouts in the 
unwritten whiteouts area are + * dropped if overwritten by real keys: + */ +unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *dst, struct sort_iter *iter) { - const struct bkey_format *f = &iter->b->format; struct bkey_packed *in, *next, *out = dst; - sort_iter_sort(iter, sort_keys_cmp); + sort_iter_sort(iter, keep_unwritten_whiteouts_cmp); - while ((in = sort_iter_next(iter, sort_keys_cmp))) { - bool needs_whiteout = false; + while ((in = sort_iter_next(iter, keep_unwritten_whiteouts_cmp))) { + if (bkey_deleted(in) && in < unwritten_whiteouts_start(iter->b)) + continue; - if (bkey_deleted(in) && - (filter_whiteouts || !in->needs_whiteout)) + if ((next = sort_iter_peek(iter)) && + !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) continue; - while ((next = sort_iter_peek(iter)) && - !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) { - BUG_ON(in->needs_whiteout && - next->needs_whiteout); - needs_whiteout |= in->needs_whiteout; - in = sort_iter_next(iter, sort_keys_cmp); - } + bkey_p_copy(out, in); + out = bkey_p_next(out); + } - if (bkey_deleted(in)) { - memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in)); - set_bkeyp_val_u64s(f, out, 0); - } else { - bkey_p_copy(out, in); - } - out->needs_whiteout |= needs_whiteout; + return (u64 *) out - (u64 *) dst; +} + +/* + * Main sort routine for compacting a btree node in memory: we always drop + * whiteouts because any whiteouts that need to be written are in the unwritten + * whiteouts area: + */ +unsigned bch2_sort_keys(struct bkey_packed *dst, struct sort_iter *iter) +{ + struct bkey_packed *in, *out = dst; + + sort_iter_sort(iter, bch2_bkey_cmp_packed_inlined); + + while ((in = sort_iter_next(iter, bch2_bkey_cmp_packed_inlined))) { + if (bkey_deleted(in)) + continue; + + bkey_p_copy(out, in); out = bkey_p_next(out); } diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h index 7c0f0b160f18..9be969d46890 100644 --- a/fs/bcachefs/bkey_sort.h +++ b/fs/bcachefs/bkey_sort.h @@ -48,7 +48,7 @@ 
bch2_sort_repack(struct bset *, struct btree *, struct btree_node_iter *, struct bkey_format *, bool); -unsigned bch2_sort_keys(struct bkey_packed *, - struct sort_iter *, bool); +unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *, struct sort_iter *); +unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *); #endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 3bb477840eab..575e1d0b6eeb 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -103,8 +103,6 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) { - struct bset_tree *t; - console_lock(); for_each_bset(b, t) bch2_dump_bset(c, b, bset(b, t), t - b->set); @@ -136,7 +134,6 @@ void bch2_dump_btree_node_iter(struct btree *b, struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) { - struct bset_tree *t; struct bkey_packed *k; struct btree_nr_keys nr = {}; @@ -198,7 +195,6 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, { struct btree_node_iter_set *set, *s2; struct bkey_packed *k, *p; - struct bset_tree *t; if (bch2_btree_node_iter_end(iter)) return; @@ -213,12 +209,14 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, /* Verify that set->end is correct: */ btree_node_iter_for_each(iter, set) { for_each_bset(b, t) - if (set->end == t->end_offset) + if (set->end == t->end_offset) { + BUG_ON(set->k < btree_bkey_first_offset(t) || + set->k >= t->end_offset); goto found; + } BUG(); found: - BUG_ON(set->k < btree_bkey_first_offset(t) || - set->k >= t->end_offset); + do {} while (0); } /* Verify iterator is sorted: */ @@ -377,11 +375,9 @@ static struct bkey_float *bkey_float(const struct btree *b, return ro_aux_tree_base(b, t)->f + idx; } -static void bset_aux_tree_verify(const struct btree *b) +static void bset_aux_tree_verify(struct btree *b) { #ifdef CONFIG_BCACHEFS_DEBUG - const struct bset_tree *t; - for_each_bset(b, t) { if 
(t->aux_data_offset == U16_MAX) continue; @@ -685,20 +681,20 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, } /* bytes remaining - only valid for last bset: */ -static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) +static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t) { bset_aux_tree_verify(b); return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); } -static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t) +static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t) { return __bset_tree_capacity(b, t) / (sizeof(struct bkey_float) + sizeof(u8)); } -static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t) +static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t) { return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); } @@ -1374,8 +1370,6 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, struct btree *b) { - struct bset_tree *t; - memset(iter, 0, sizeof(*iter)); for_each_bset(b, t) @@ -1481,7 +1475,6 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, { struct bkey_packed *k, *prev = NULL; struct btree_node_iter_set *set; - struct bset_tree *t; unsigned end = 0; if (bch2_expensive_debug_checks) @@ -1550,9 +1543,7 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats) { - const struct bset_tree *t; - - for_each_bset(b, t) { + for_each_bset_c(b, t) { enum bset_aux_tree_type type = bset_aux_tree_type(t); size_t j; diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 120a79fd456b..5c6c7a14fa0f 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -206,7 +206,10 @@ static inline size_t btree_aux_data_u64s(const struct 
btree *b) } #define for_each_bset(_b, _t) \ - for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) + for (struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) + +#define for_each_bset_c(_b, _t) \ + for (const struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) #define bset_tree_for_each_key(_b, _t, _k) \ for (_k = btree_bkey_first(_b, _t); \ @@ -294,7 +297,6 @@ static inline struct bset_tree * bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k) { unsigned offset = __btree_node_key_to_offset(b, k); - struct bset_tree *t; for_each_bset(b, t) if (offset <= t->end_offset) { diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 02c70e813fac..9e4ed75d3675 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -16,6 +16,12 @@ #include <linux/prefetch.h> #include <linux/sched/mm.h> +#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ +do { \ + if (shrinker_counter) \ + bc->not_freed_##counter++; \ +} while (0) + const char * const bch2_btree_node_flags[] = { #define x(f) #f, BTREE_FLAGS() @@ -162,6 +168,9 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) /* Cause future lookups for this node to fail: */ b->hash_val = 0; + + if (b->c.btree_id < BTREE_ID_NR) + --bc->used_by_btree[b->c.btree_id]; } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) @@ -169,8 +178,11 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) BUG_ON(b->hash_val); b->hash_val = btree_ptr_hash_val(&b->key); - return rhashtable_lookup_insert_fast(&bc->table, &b->hash, - bch_btree_cache_params); + int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash, + bch_btree_cache_params); + if (!ret && b->c.btree_id < BTREE_ID_NR) + bc->used_by_btree[b->c.btree_id]++; + return ret; } int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, @@ -190,6 +202,35 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree 
*b, return ret; } +void bch2_btree_node_update_key_early(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_i *new) +{ + struct bch_fs *c = trans->c; + struct btree *b; + struct bkey_buf tmp; + int ret; + + bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_reassemble(&tmp, c, old); + + b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); + if (!IS_ERR_OR_NULL(b)) { + mutex_lock(&c->btree_cache.lock); + + bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, new); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + + mutex_unlock(&c->btree_cache.lock); + six_unlock_read(&b->c.lock); + } + + bch2_bkey_buf_exit(&tmp, c); +} + __flatten static inline struct btree *btree_cache_find(struct btree_cache *bc, const struct bkey_i *k) @@ -203,7 +244,7 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc, * this version is for btree nodes that have already been freed (we're not * reaping a real btree node) */ -static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) +static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter) { struct btree_cache *bc = &c->btree_cache; int ret = 0; @@ -225,38 +266,64 @@ wait_on_io: if (b->flags & ((1U << BTREE_NODE_dirty)| (1U << BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { - if (!flush) + if (!flush) { + if (btree_node_dirty(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(dirty); + else if (btree_node_read_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); + else if (btree_node_write_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); return -BCH_ERR_ENOMEM_btree_node_reclaim; + } /* XXX: waiting on IO with btree cache lock held */ bch2_btree_node_wait_on_read(b); bch2_btree_node_wait_on_write(b); } - if (!six_trylock_intent(&b->c.lock)) + if (!six_trylock_intent(&b->c.lock)) { + BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); 
return -BCH_ERR_ENOMEM_btree_node_reclaim; + } - if (!six_trylock_write(&b->c.lock)) + if (!six_trylock_write(&b->c.lock)) { + BTREE_CACHE_NOT_FREED_INCREMENT(lock_write); goto out_unlock_intent; + } /* recheck under lock */ if (b->flags & ((1U << BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { - if (!flush) + if (!flush) { + if (btree_node_read_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); + else if (btree_node_write_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); goto out_unlock; + } six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); goto wait_on_io; } - if (btree_node_noevict(b) || - btree_node_write_blocked(b) || - btree_node_will_make_reachable(b)) + if (btree_node_noevict(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(noevict); + goto out_unlock; + } + if (btree_node_write_blocked(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked); goto out_unlock; + } + if (btree_node_will_make_reachable(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable); + goto out_unlock; + } if (btree_node_dirty(b)) { - if (!flush) + if (!flush) { + BTREE_CACHE_NOT_FREED_INCREMENT(dirty); goto out_unlock; + } /* * Using the underscore version because we don't want to compact * bsets after the write, since this node is about to be evicted @@ -286,14 +353,14 @@ out_unlock_intent: goto out; } -static int btree_node_reclaim(struct bch_fs *c, struct btree *b) +static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter) { - return __btree_node_reclaim(c, b, false); + return __btree_node_reclaim(c, b, false, shrinker_counter); } static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) { - return __btree_node_reclaim(c, b, true); + return __btree_node_reclaim(c, b, true, false); } static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, @@ -341,11 +408,12 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (touched >= nr) goto out; - if 
(!btree_node_reclaim(c, b)) { + if (!btree_node_reclaim(c, b, true)) { btree_node_data_free(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); freed++; + bc->freed++; } } restart: @@ -354,9 +422,11 @@ restart: if (btree_node_accessed(b)) { clear_btree_node_accessed(b); - } else if (!btree_node_reclaim(c, b)) { + bc->not_freed_access_bit++; + } else if (!btree_node_reclaim(c, b, true)) { freed++; btree_node_data_free(c, b); + bc->freed++; bch2_btree_node_hash_remove(bc, b); six_unlock_write(&b->c.lock); @@ -564,7 +634,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) struct btree *b; list_for_each_entry_reverse(b, &bc->live, list) - if (!btree_node_reclaim(c, b)) + if (!btree_node_reclaim(c, b, false)) return b; while (1) { @@ -600,7 +670,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, freed, list) - if (!btree_node_reclaim(c, b)) { + if (!btree_node_reclaim(c, b, false)) { list_del_init(&b->list); goto got_node; } @@ -626,7 +696,7 @@ got_node: * the list. 
Check if there's any freed nodes there: */ list_for_each_entry(b2, &bc->freeable, list) - if (!btree_node_reclaim(c, b2)) { + if (!btree_node_reclaim(c, b2, false)) { swap(b->data, b2->data); swap(b->aux_data, b2->aux_data); btree_node_to_freedlist(bc, b2); @@ -846,7 +916,6 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; - struct bset_tree *t; bool need_relock = false; int ret; @@ -966,7 +1035,6 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * { struct bch_fs *c = trans->c; struct btree *b; - struct bset_tree *t; int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); @@ -1043,7 +1111,6 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; - struct bset_tree *t; int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); @@ -1240,9 +1307,39 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc stats.failed); } -void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c) +static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c, + const char *label, unsigned nr) { - prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); - prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); - prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); + prt_printf(out, "%s\t", label); + prt_human_readable_u64(out, nr * c->opts.btree_node_size); + prt_printf(out, " (%u)\n", nr); +} + +void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc) +{ + struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); + + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + + prt_btree_cache_line(out, c, "total:", bc->used); + prt_btree_cache_line(out, c, "nr dirty:", atomic_read(&bc->dirty)); + prt_printf(out, "cannibalize 
lock:\t%p\n", bc->alloc_lock); + prt_newline(out); + + for (unsigned i = 0; i < ARRAY_SIZE(bc->used_by_btree); i++) + prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->used_by_btree[i]); + + prt_newline(out); + prt_printf(out, "freed:\t%u\n", bc->freed); + prt_printf(out, "not freed:\n"); + prt_printf(out, " dirty\t%u\n", bc->not_freed_dirty); + prt_printf(out, " write in flight\t%u\n", bc->not_freed_write_in_flight); + prt_printf(out, " read in flight\t%u\n", bc->not_freed_read_in_flight); + prt_printf(out, " lock intent failed\t%u\n", bc->not_freed_lock_intent); + prt_printf(out, " lock write failed\t%u\n", bc->not_freed_lock_write); + prt_printf(out, " access bit\t%u\n", bc->not_freed_access_bit); + prt_printf(out, " no evict failed\t%u\n", bc->not_freed_noevict); + prt_printf(out, " write blocked\t%u\n", bc->not_freed_write_blocked); + prt_printf(out, " will make reachable\t%u\n", bc->not_freed_will_make_reachable); } diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 6d33885fdbde..fed35de3e4de 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -17,6 +17,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, unsigned, enum btree_id); +void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *); + void bch2_btree_cache_cannibalize_unlock(struct btree_trans *); int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *); @@ -131,6 +134,6 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) const char *bch2_btree_id_str(enum btree_id); void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); -void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *); +void bch2_btree_cache_to_text(struct 
printbuf *, const struct btree_cache *); #endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 791470b0c654..8035c8b797ab 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -52,12 +52,6 @@ static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) }}}; } -static bool should_restart_for_topology_repair(struct bch_fs *c) -{ - return c->opts.fix_errors != FSCK_FIX_no && - !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); -} - static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) { preempt_disable(); @@ -69,7 +63,7 @@ static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) { - BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); + BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0); __gc_pos_set(c, new_pos); } @@ -97,35 +91,6 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) } } -static void bch2_btree_node_update_key_early(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_i *new) -{ - struct bch_fs *c = trans->c; - struct btree *b; - struct bkey_buf tmp; - int ret; - - bch2_bkey_buf_init(&tmp); - bch2_bkey_buf_reassemble(&tmp, c, old); - - b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); - if (!IS_ERR_OR_NULL(b)) { - mutex_lock(&c->btree_cache.lock); - - bch2_btree_node_hash_remove(&c->btree_cache, b); - - bkey_copy(&b->key, new); - ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); - BUG_ON(ret); - - mutex_unlock(&c->btree_cache.lock); - six_unlock_read(&b->c.lock); - } - - bch2_bkey_buf_exit(&tmp, c); -} - static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) { struct bkey_i_btree_ptr_v2 *new; @@ -546,9 +511,9 @@ reconstruct_root: if (!bch2_btree_has_scanned_nodes(c, i)) { mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing, "no nodes found 
for btree %s, continue?", bch2_btree_id_str(i)); - bch2_btree_root_alloc_fake(c, i, 0); + bch2_btree_root_alloc_fake_trans(trans, i, 0); } else { - bch2_btree_root_alloc_fake(c, i, 1); + bch2_btree_root_alloc_fake_trans(trans, i, 1); bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); if (ret) @@ -576,7 +541,7 @@ reconstruct_root: goto reconstruct_root; bch_err(c, "empty btree root %s", bch2_btree_id_str(i)); - bch2_btree_root_alloc_fake(c, i, 0); + bch2_btree_root_alloc_fake_trans(trans, i, 0); r->alive = false; ret = 0; } @@ -586,495 +551,123 @@ fsck_err: return ret; } -static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id, - unsigned level, bool is_root, - struct bkey_s_c *k) +/* marking of btree keys/nodes: */ + +static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + unsigned level, struct btree **prev, + struct btree_iter *iter, struct bkey_s_c k, + bool initial) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k); - const union bch_extent_entry *entry_c; - struct extent_ptr_decoded p = { 0 }; - bool do_update = false; - struct printbuf buf = PRINTBUF; - int ret = 0; - - /* - * XXX - * use check_bucket_ref here - */ - bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, p, entry_c); - - if (fsck_err_on(!g->gen_valid, - c, ptr_to_missing_alloc_key, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { - if (!p.ptr.cached) { - g->gen_valid = true; - g->gen = p.ptr.gen; - } else { - do_update = true; - } - } - - if 
(fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, - c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { - if (!p.ptr.cached) { - g->gen_valid = true; - g->gen = p.ptr.gen; - g->data_type = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; - set_bit(BCH_FS_need_another_gc, &c->flags); - } else { - do_update = true; - } - } - - if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, - c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) - do_update = true; - - if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, - c, stale_dirty_ptr, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) - do_update = true; - - if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) - continue; - - if (fsck_err_on(bucket_data_type(g->data_type) && - bucket_data_type(g->data_type) != - bucket_data_type(data_type), c, - ptr_bucket_data_type_mismatch, - "bucket %u:%zu different types of data in same bucket: %s, %s\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { - if (data_type == BCH_DATA_btree) { - g->data_type = data_type; - set_bit(BCH_FS_need_another_gc, &c->flags); - } else { - do_update = true; - } - 
} - - if (p.has_ec) { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); - - if (fsck_err_on(!m || !m->alive, c, - ptr_to_missing_stripe, - "pointer to nonexistent stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) - do_update = true; - - if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, - ptr_to_incorrect_stripe, - "pointer does not match stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) - do_update = true; - } - } - - if (do_update) { - if (is_root) { - bch_err(c, "cannot update btree roots yet"); - ret = -EINVAL; - goto err; - } - - struct bkey_i *new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); - if (!new) { - ret = -BCH_ERR_ENOMEM_gc_repair_key; - bch_err_msg(c, ret, "allocating new key"); - goto err; - } - - bkey_reassemble(new, *k); - - if (level) { - /* - * We don't want to drop btree node pointers - if the - * btree node isn't there anymore, the read path will - * sort it out: - */ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_GC_BUCKET(ca, ptr); - - ptr->gen = g->gen; - } - } else { - struct bkey_ptrs ptrs; - union bch_extent_entry *entry; -restart_drop_ptrs: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry); - - if ((p.ptr.cached && - (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) || - (!p.ptr.cached && - gen_cmp(p.ptr.gen, g->gen) < 0) || - gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX || - (g->data_type && - g->data_type != data_type)) { - bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr); - 
goto restart_drop_ptrs; - } - } -again: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_extent_entry_for_each(ptrs, entry) { - if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, - entry->stripe_ptr.idx); - union bch_extent_entry *next_ptr; - - bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) - if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) - goto found; - next_ptr = NULL; -found: - if (!next_ptr) { - bch_err(c, "aieee, found stripe ptr with no data ptr"); - continue; - } - - if (!m || !m->alive || - !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], - &next_ptr->ptr, - m->sectors)) { - bch2_bkey_extent_entry_drop(new, entry); - goto again; - } - } - } - } - if (level) - bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); + if (iter) { + struct btree_path *path = btree_iter_path(trans, iter); + struct btree *b = path_l(path)->b; - if (0) { - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, *k); - bch_info(c, "updated %s", buf.buf); - - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); - bch_info(c, "new key %s", buf.buf); - } - - ret = bch2_journal_key_insert_take(c, btree_id, level, new); - if (ret) { - kfree(new); - goto err; + if (*prev != b) { + int ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; } - - *k = bkey_i_to_s_c(new); + *prev = b; } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} -/* marking of btree keys/nodes: */ - -static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, - unsigned level, bool is_root, - struct bkey_s_c *k, - bool initial) -{ - struct bch_fs *c = trans->c; struct bkey deleted = KEY(0, 0, 0); struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; struct printbuf buf = PRINTBUF; int ret = 0; - deleted.p = k->k->p; + deleted.p = k.k->p; if (initial) { BUG_ON(bch2_journal_seq_verify && - k->k->version.lo > 
atomic64_read(&c->journal.seq)); + k.k->version.lo > atomic64_read(&c->journal.seq)); - if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, + if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, bkey_version_in_future, "key version number higher than recorded: %llu > %llu", - k->k->version.lo, + k.k->version.lo, atomic64_read(&c->key_version))) - atomic64_set(&c->key_version, k->k->version.lo); + atomic64_set(&c->key_version, k.k->version.lo); } - ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); - if (ret) - goto err; - - if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, *k), + if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k), c, btree_bitmap_not_marked, "btree ptr not marked in member info btree allocated bitmap\n %s", - (bch2_bkey_val_to_text(&buf, c, *k), + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { mutex_lock(&c->sb_lock); - bch2_dev_btree_bitmap_mark(c, *k); + bch2_dev_btree_bitmap_mark(c, k); bch2_write_super(c); mutex_unlock(&c->sb_lock); } - ret = commit_do(trans, NULL, NULL, 0, - bch2_key_trigger(trans, btree_id, level, old, - unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); -fsck_err: -err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) -{ - struct btree_node_iter iter; - struct bkey unpacked; - struct bkey_s_c k; - int ret = 0; + /* + * We require a commit before key_trigger() because + * key_trigger(BTREE_TRIGGER_GC) is not idempotant; we'll calculate the + * wrong result if we run it multiple times. + */ + unsigned flags = !iter ? 
BTREE_TRIGGER_is_root : 0; - ret = bch2_btree_node_check_topology(trans, b); + ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), + BTREE_TRIGGER_check_repair|flags); if (ret) - return ret; - - if (!btree_node_type_needs_gc(btree_node_type(b))) - return 0; - - bch2_btree_node_iter_init_from_start(&iter, b); - - while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, - &k, initial); - if (ret) - return ret; + goto out; - bch2_btree_node_iter_advance(&iter, b); + if (trans->nr_updates) { + ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto out; } - return 0; + ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), + BTREE_TRIGGER_gc|flags); +out: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; } -static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, - bool initial, bool metadata_only) +static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct btree *b; - unsigned depth = metadata_only ? 1 : 0; + int level = 0, target_depth = btree_node_type_needs_gc(__btree_node_type(0, btree)) ? 
0 : 1; int ret = 0; - gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); - - __for_each_btree_node(trans, iter, btree_id, POS_MIN, - 0, depth, BTREE_ITER_PREFETCH, b, ret) { - bch2_verify_btree_nr_keys(b); - - gc_pos_set(c, gc_pos_btree_node(b)); - - ret = btree_gc_mark_node(trans, b, initial); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - return ret; + /* We need to make sure every leaf node is readable before going RW */ + if (initial) + target_depth = 0; + /* root */ mutex_lock(&c->btree_root_lock); - b = bch2_btree_id_root(c, btree_id)->b; + struct btree *b = bch2_btree_id_root(c, btree)->b; if (!btree_node_fake(b)) { - struct bkey_s_c k = bkey_i_to_s_c(&b->key); - - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, - true, &k, initial); + gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX)); + ret = lockrestart_do(trans, + bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, + NULL, NULL, bkey_i_to_s_c(&b->key), initial)); + level = b->c.level; } - gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); mutex_unlock(&c->btree_root_lock); - return ret; -} - -static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b, - unsigned target_depth) -{ - struct bch_fs *c = trans->c; - struct btree_and_journal_iter iter; - struct bkey_s_c k; - struct bkey_buf cur; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = bch2_btree_node_check_topology(trans, b); if (ret) return ret; - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - bch2_bkey_buf_init(&cur); - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - BUG_ON(bpos_lt(k.k->p, b->data->min_key)); - BUG_ON(bpos_gt(k.k->p, b->data->max_key)); + for (; level >= target_depth; --level) { + struct btree *prev = NULL; + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level, + BTREE_ITER_prefetch); - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, - false, &k, true); + ret = 
for_each_btree_key_continue(trans, iter, 0, k, ({ + gc_pos_set(c, gc_pos_btree(btree, level, k.k->p)); + bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial); + })); if (ret) - goto fsck_err; - - bch2_btree_and_journal_iter_advance(&iter); - } - - if (b->c.level > target_depth) { - bch2_btree_and_journal_iter_exit(&iter); - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - iter.prefetch = true; - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - struct btree *child; - - bch2_bkey_buf_reassemble(&cur, c, k); - bch2_btree_and_journal_iter_advance(&iter); - - child = bch2_btree_node_get_noiter(trans, cur.k, - b->c.btree_id, b->c.level - 1, - false); - ret = PTR_ERR_OR_ZERO(child); - - if (bch2_err_matches(ret, EIO)) { - bch2_topology_error(c); - - if (__fsck_err(c, - FSCK_CAN_FIX| - FSCK_CAN_IGNORE| - FSCK_NO_RATELIMIT, - btree_node_read_error, - "Unreadable btree node at btree %s level %u:\n" - " %s", - bch2_btree_id_str(b->c.btree_id), - b->c.level - 1, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && - should_restart_for_topology_repair(c)) { - bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - goto fsck_err; - } else { - /* Continue marking when opted to not - * fix the error: */ - ret = 0; - set_bit(BCH_FS_initial_gc_unfixed, &c->flags); - continue; - } - } else if (ret) { - bch_err_msg(c, ret, "getting btree node"); - break; - } - - ret = bch2_gc_btree_init_recurse(trans, child, - target_depth); - six_unlock_read(&child->c.lock); - - if (ret) - break; - } - } -fsck_err: - bch2_bkey_buf_exit(&cur, c); - bch2_btree_and_journal_iter_exit(&iter); - printbuf_exit(&buf); - return ret; -} - -static int bch2_gc_btree_init(struct btree_trans *trans, - enum btree_id btree_id, - bool metadata_only) -{ - struct bch_fs *c = trans->c; - struct btree *b; - unsigned target_depth = metadata_only ? 
1 : 0; - struct printbuf buf = PRINTBUF; - int ret = 0; - - b = bch2_btree_id_root(c, btree_id)->b; - - six_lock_read(&b->c.lock, NULL, NULL); - printbuf_reset(&buf); - bch2_bpos_to_text(&buf, b->data->min_key); - if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c, - btree_root_bad_min_key, - "btree root with incorrect min_key: %s", buf.buf)) { - bch_err(c, "repair unimplemented"); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto fsck_err; - } - - printbuf_reset(&buf); - bch2_bpos_to_text(&buf, b->data->max_key); - if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c, - btree_root_bad_max_key, - "btree root with incorrect max_key: %s", buf.buf)) { - bch_err(c, "repair unimplemented"); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto fsck_err; - } - - if (b->c.level >= target_depth) - ret = bch2_gc_btree_init_recurse(trans, b, target_depth); - - if (!ret) { - struct bkey_s_c k = bkey_i_to_s_c(&b->key); - - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true, - &k, true); + break; } -fsck_err: - six_unlock_read(&b->c.lock); - bch_err_fn(c, ret); - printbuf_exit(&buf); return ret; } @@ -1084,7 +677,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) (int) btree_id_to_gc_phase(r); } -static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) +static int bch2_gc_btrees(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); enum btree_id ids[BTREE_ID_NR]; @@ -1095,98 +688,36 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) ids[i] = i; bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); - for (i = 0; i < BTREE_ID_NR && !ret; i++) - ret = initial - ? bch2_gc_btree_init(trans, ids[i], metadata_only) - : bch2_gc_btree(trans, ids[i], initial, metadata_only); + for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + unsigned btree = i < BTREE_ID_NR ? 
ids[i] : i; - for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) { - if (!bch2_btree_id_root(c, i)->alive) + if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b)) continue; - ret = initial - ? bch2_gc_btree_init(trans, i, metadata_only) - : bch2_gc_btree(trans, i, initial, metadata_only); - } + ret = bch2_gc_btree(trans, btree, true); + if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), + c, btree_node_read_error, + "btree node read error for %s", + bch2_btree_id_str(btree))) + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); + } +fsck_err: bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } -static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, - u64 start, u64 end, - enum bch_data_type type, - unsigned flags) -{ - u64 b = sector_to_bucket(ca, start); - - do { - unsigned sectors = - min_t(u64, bucket_to_sector(ca, b + 1), end) - start; - - bch2_mark_metadata_bucket(c, ca, b, type, sectors, - gc_phase(GC_PHASE_SB), flags); - b++; - start += sectors; - } while (start < end); -} - -static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, - unsigned flags) -{ - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; - unsigned i; - u64 b; - - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); - - if (offset == BCH_SB_SECTOR) - mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, - BCH_DATA_sb, flags); - - mark_metadata_sectors(c, ca, offset, - offset + (1 << layout->sb_max_size_bits), - BCH_DATA_sb, flags); - } - - for (i = 0; i < ca->journal.nr; i++) { - b = ca->journal.buckets[i]; - bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, - ca->mi.bucket_size, - gc_phase(GC_PHASE_SB), flags); - } -} - -static void bch2_mark_superblocks(struct bch_fs *c) +static int bch2_mark_superblocks(struct bch_fs *c) { mutex_lock(&c->sb_lock); gc_pos_set(c, gc_phase(GC_PHASE_SB)); - for_each_online_member(c, ca) - bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); + 
int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); mutex_unlock(&c->sb_lock); + return ret; } -#if 0 -/* Also see bch2_pending_btree_node_free_insert_done() */ -static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) -{ - struct btree_update *as; - struct pending_btree_node_free *d; - - mutex_lock(&c->btree_interior_update_lock); - gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE)); - - for_each_pending_btree_node_free(c, as, d) - if (d->index_update_done) - bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC); - - mutex_unlock(&c->btree_interior_update_lock); -} -#endif - static void bch2_gc_free(struct bch_fs *c) { genradix_free(&c->reflink_gc_table); @@ -1204,28 +735,23 @@ static void bch2_gc_free(struct bch_fs *c) c->usage_gc = NULL; } -static int bch2_gc_done(struct bch_fs *c, - bool initial, bool metadata_only) +static int bch2_gc_done(struct bch_fs *c) { struct bch_dev *ca = NULL; struct printbuf buf = PRINTBUF; - bool verify = !metadata_only && - !c->opts.reconstruct_alloc && - (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); unsigned i; int ret = 0; percpu_down_write(&c->mark_lock); -#define copy_field(_err, _f, _msg, ...) \ - if (dst->_f != src->_f && \ - (!verify || \ - fsck_err(c, _err, _msg ": got %llu, should be %llu" \ - , ##__VA_ARGS__, dst->_f, src->_f))) \ +#define copy_field(_err, _f, _msg, ...) \ + if (fsck_err_on(dst->_f != src->_f, c, _err, \ + _msg ": got %llu, should be %llu" , ##__VA_ARGS__, \ + dst->_f, src->_f)) \ dst->_f = src->_f -#define copy_dev_field(_err, _f, _msg, ...) \ +#define copy_dev_field(_err, _f, _msg, ...) \ copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__) -#define copy_fs_field(_err, _f, _msg, ...) \ +#define copy_fs_field(_err, _f, _msg, ...) 
\ copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__) for (i = 0; i < ARRAY_SIZE(c->usage); i++) @@ -1258,31 +784,24 @@ static int bch2_gc_done(struct bch_fs *c, copy_fs_field(fs_usage_btree_wrong, b.btree, "btree"); - if (!metadata_only) { - copy_fs_field(fs_usage_data_wrong, - b.data, "data"); - copy_fs_field(fs_usage_cached_wrong, - b.cached, "cached"); - copy_fs_field(fs_usage_reserved_wrong, - b.reserved, "reserved"); - copy_fs_field(fs_usage_nr_inodes_wrong, - b.nr_inodes,"nr_inodes"); - - for (i = 0; i < BCH_REPLICAS_MAX; i++) - copy_fs_field(fs_usage_persistent_reserved_wrong, - persistent_reserved[i], - "persistent_reserved[%i]", i); - } + copy_fs_field(fs_usage_data_wrong, + b.data, "data"); + copy_fs_field(fs_usage_cached_wrong, + b.cached, "cached"); + copy_fs_field(fs_usage_reserved_wrong, + b.reserved, "reserved"); + copy_fs_field(fs_usage_nr_inodes_wrong, + b.nr_inodes,"nr_inodes"); + + for (i = 0; i < BCH_REPLICAS_MAX; i++) + copy_fs_field(fs_usage_persistent_reserved_wrong, + persistent_reserved[i], + "persistent_reserved[%i]", i); for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); - if (metadata_only && - (e->data_type == BCH_DATA_user || - e->data_type == BCH_DATA_cached)) - continue; - printbuf_reset(&buf); bch2_replicas_entry_to_text(&buf, e); @@ -1296,10 +815,8 @@ static int bch2_gc_done(struct bch_fs *c, #undef copy_stripe_field #undef copy_field fsck_err: - if (ca) - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); bch_err_fn(c, ret); - percpu_up_write(&c->mark_lock); printbuf_exit(&buf); return ret; @@ -1322,7 +839,7 @@ static int bch2_gc_start(struct bch_fs *c) ca->usage_gc = alloc_percpu(struct bch_dev_usage); if (!ca->usage_gc) { bch_err(c, "error allocating ca->usage_gc"); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return -BCH_ERR_ENOMEM_gc_start; } @@ -1333,19 +850,6 @@ static int bch2_gc_start(struct bch_fs *c) return 0; } -static int bch2_gc_reset(struct bch_fs *c) -{ 
- for_each_member_device(c, ca) { - free_percpu(ca->usage_gc); - ca->usage_gc = NULL; - } - - free_percpu(c->usage_gc); - c->usage_gc = NULL; - - return bch2_gc_start(c); -} - /* returns true if not equal */ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, struct bch_alloc_v4 r) @@ -1361,56 +865,41 @@ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, static int bch2_alloc_write_key(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - bool metadata_only) + struct bch_dev *ca, + struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); - struct bucket old_gc, gc, *b; struct bkey_i_alloc_v4 *a; - struct bch_alloc_v4 old_convert, new; + struct bch_alloc_v4 old_gc, gc, old_convert, new; const struct bch_alloc_v4 *old; int ret; old = bch2_alloc_to_v4(k, &old_convert); - new = *old; + gc = new = *old; percpu_down_read(&c->mark_lock); - b = gc_bucket(ca, iter->pos.offset); - old_gc = *b; + __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset)); + + old_gc = gc; if ((old->data_type == BCH_DATA_sb || old->data_type == BCH_DATA_journal) && !bch2_dev_is_online(ca)) { - b->data_type = old->data_type; - b->dirty_sectors = old->dirty_sectors; + gc.data_type = old->data_type; + gc.dirty_sectors = old->dirty_sectors; } /* - * b->data_type doesn't yet include need_discard & need_gc_gen states - + * gc.data_type doesn't yet include need_discard & need_gc_gen states - * fix that here: */ - b->data_type = __alloc_data_type(b->dirty_sectors, - b->cached_sectors, - b->stripe, - *old, - b->data_type); - gc = *b; + alloc_data_type_set(&gc, gc.data_type); if (gc.data_type != old_gc.data_type || gc.dirty_sectors != old_gc.dirty_sectors) - bch2_dev_usage_update_m(c, ca, &old_gc, &gc); + bch2_dev_usage_update(c, ca, &old_gc, &gc, 0, true); percpu_up_read(&c->mark_lock); - if (metadata_only && - gc.data_type != BCH_DATA_sb && - gc.data_type != BCH_DATA_journal && - gc.data_type != 
BCH_DATA_btree) - return 0; - - if (gen_after(old->gen, gc.gen)) - return 0; - if (fsck_err_on(new.data_type != gc.data_type, c, alloc_key_data_type_wrong, "bucket %llu:%llu gen %u has wrong data_type" @@ -1460,12 +949,12 @@ static int bch2_alloc_write_key(struct btree_trans *trans, if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ]) a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); + ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun); fsck_err: return ret; } -static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) +static int bch2_gc_alloc_done(struct bch_fs *c) { int ret = 0; @@ -1474,11 +963,11 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, ca->mi.first_bucket), POS(ca->dev_idx, ca->mi.nbuckets - 1), - BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, + BTREE_ITER_slots|BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, - bch2_alloc_write_key(trans, &iter, k, metadata_only))); + bch2_alloc_write_key(trans, &iter, ca, k))); if (ret) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); break; } } @@ -1487,14 +976,14 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) return ret; } -static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) +static int bch2_gc_alloc_start(struct bch_fs *c) { for_each_member_device(c, ca) { struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO); if (!buckets) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); bch_err(c, "error allocating ca->buckets[gc]"); return -BCH_ERR_ENOMEM_gc_alloc_start; } @@ -1504,54 +993,29 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) rcu_assign_pointer(ca->buckets_gc, buckets); } + struct bch_dev *ca = NULL; int ret = bch2_trans_run(c, 
for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ - struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); - struct bucket *g = gc_bucket(ca, k.k->p.offset); + BTREE_ITER_prefetch, k, ({ + ca = bch2_dev_iterate(c, ca, k.k->p.inode); + if (!ca) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); + struct bucket *g = gc_bucket(ca, k.k->p.offset); g->gen_valid = 1; g->gen = a->gen; - - if (metadata_only && - (a->data_type == BCH_DATA_user || - a->data_type == BCH_DATA_cached || - a->data_type == BCH_DATA_parity)) { - g->data_type = a->data_type; - g->dirty_sectors = a->dirty_sectors; - g->cached_sectors = a->cached_sectors; - g->stripe = a->stripe; - g->stripe_redundancy = a->stripe_redundancy; - } - 0; }))); + bch2_dev_put(ca); bch_err_fn(c, ret); return ret; } -static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) -{ - for_each_member_device(c, ca) { - struct bucket_array *buckets = gc_bucket_array(ca); - struct bucket *g; - - for_each_bucket(g, buckets) { - if (metadata_only && - (g->data_type == BCH_DATA_user || - g->data_type == BCH_DATA_cached || - g->data_type == BCH_DATA_parity)) - continue; - g->data_type = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; - } - } -} - static int bch2_gc_write_reflink_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -1601,35 +1065,27 @@ fsck_err: return ret; } -static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) +static int bch2_gc_reflink_done(struct bch_fs *c) { size_t idx = 0; - if (metadata_only) - return 0; - int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_gc_write_reflink_key(trans, &iter, k, &idx))); c->reflink_gc_nr = 0; return ret; } -static int 
bch2_gc_reflink_start(struct bch_fs *c, - bool metadata_only) +static int bch2_gc_reflink_start(struct bch_fs *c) { - - if (metadata_only) - return 0; - c->reflink_gc_nr = 0; int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ const __le64 *refcount = bkey_refcount_c(k); if (!refcount) @@ -1652,15 +1108,6 @@ static int bch2_gc_reflink_start(struct bch_fs *c, return ret; } -static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only) -{ - struct genradix_iter iter; - struct reflink_gc *r; - - genradix_for_each(&c->reflink_gc_table, iter, r) - r->refcount = 0; -} - static int bch2_gc_write_stripes_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) @@ -1714,30 +1161,20 @@ fsck_err: return ret; } -static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) +static int bch2_gc_stripes_done(struct bch_fs *c) { - if (metadata_only) - return 0; - return bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_gc_write_stripes_key(trans, &iter, k))); } -static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) -{ - genradix_free(&c->gc_stripes); -} - /** - * bch2_gc - walk _all_ references to buckets, and recompute them: + * bch2_check_allocations - walk all references to buckets, and recompute them: * * @c: filesystem object - * @initial: are we in recovery? - * @metadata_only: are we just checking metadata references, or everything? 
* * Returns: 0 on success, or standard errcode on failure * @@ -1756,9 +1193,8 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) * move around - if references move backwards in the ordering GC * uses, GC could skip past them */ -int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) +int bch2_check_allocations(struct bch_fs *c) { - unsigned iter = 0; int ret; lockdep_assert_held(&c->state_lock); @@ -1768,62 +1204,30 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) bch2_btree_interior_updates_flush(c); ret = bch2_gc_start(c) ?: - bch2_gc_alloc_start(c, metadata_only) ?: - bch2_gc_reflink_start(c, metadata_only); + bch2_gc_alloc_start(c) ?: + bch2_gc_reflink_start(c); if (ret) goto out; -again: - gc_pos_set(c, gc_phase(GC_PHASE_START)); - bch2_mark_superblocks(c); + gc_pos_set(c, gc_phase(GC_PHASE_START)); - ret = bch2_gc_btrees(c, initial, metadata_only); + ret = bch2_mark_superblocks(c); + BUG_ON(ret); + ret = bch2_gc_btrees(c); if (ret) goto out; -#if 0 - bch2_mark_pending_btree_node_frees(c); -#endif c->gc_count++; - if (test_bit(BCH_FS_need_another_gc, &c->flags) || - (!iter && bch2_test_restart_gc)) { - if (iter++ > 2) { - bch_info(c, "Unable to fix bucket gens, looping"); - ret = -EINVAL; - goto out; - } - - /* - * XXX: make sure gens we fixed got saved - */ - bch_info(c, "Second GC pass needed, restarting:"); - clear_bit(BCH_FS_need_another_gc, &c->flags); - __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); - - bch2_gc_stripes_reset(c, metadata_only); - bch2_gc_alloc_reset(c, metadata_only); - bch2_gc_reflink_reset(c, metadata_only); - ret = bch2_gc_reset(c); - if (ret) - goto out; - - /* flush fsck errors, reset counters */ - bch2_flush_fsck_errs(c); - goto again; - } + bch2_journal_block(&c->journal); out: - if (!ret) { - bch2_journal_block(&c->journal); + ret = bch2_gc_alloc_done(c) ?: + bch2_gc_done(c) ?: + bch2_gc_stripes_done(c) ?: + bch2_gc_reflink_done(c); - ret = bch2_gc_alloc_done(c, metadata_only) 
?: - bch2_gc_done(c, initial, metadata_only) ?: - bch2_gc_stripes_done(c, metadata_only) ?: - bch2_gc_reflink_done(c, metadata_only); - - bch2_journal_unblock(&c->journal); - } + bch2_journal_unblock(&c->journal); percpu_down_write(&c->mark_lock); /* Indicates that gc is no longer in progress: */ @@ -1852,23 +1256,33 @@ static int gc_btree_gens_key(struct btree_trans *trans, struct bkey_i *u; int ret; + if (unlikely(test_bit(BCH_FS_going_ro, &c->flags))) + return -EROFS; + percpu_down_read(&c->mark_lock); + rcu_read_lock(); bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; - if (ptr_stale(ca, ptr) > 16) { + if (dev_ptr_stale(ca, ptr) > 16) { + rcu_read_unlock(); percpu_up_read(&c->mark_lock); goto update; } } bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; + u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; if (gen_after(*gen, ptr->gen)) *gen = ptr->gen; } + rcu_read_unlock(); percpu_up_read(&c->mark_lock); return 0; update: @@ -1881,10 +1295,9 @@ update: return 0; } -static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) +static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca, + struct btree_iter *iter, struct bkey_s_c k) { - struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); struct bkey_i_alloc_v4 *a_mut; @@ -1899,7 +1312,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i return ret; a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; - a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type); + alloc_data_type_set(&a_mut->v, a_mut->v.data_type); 
return bch2_trans_update(trans, iter, &a_mut->k_i, 0); } @@ -1927,7 +1340,7 @@ int bch2_gc_gens(struct bch_fs *c) ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL); if (!ca->oldest_gen) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); ret = -BCH_ERR_ENOMEM_gc_gens; goto err; } @@ -1945,7 +1358,7 @@ int bch2_gc_gens(struct bch_fs *c) ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, i, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, @@ -1954,14 +1367,23 @@ int bch2_gc_gens(struct bch_fs *c) goto err; } + struct bch_dev *ca = NULL; ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, + BTREE_ITER_prefetch, k, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - bch2_alloc_write_oldest_gen(trans, &iter, k))); + BCH_TRANS_COMMIT_no_enospc, ({ + ca = bch2_dev_iterate(c, ca, k.k->p.inode); + if (!ca) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } + bch2_alloc_write_oldest_gen(trans, ca, &iter, k); + }))); + bch2_dev_put(ca); + if (ret) goto err; @@ -1985,87 +1407,23 @@ err: return ret; } -static int bch2_gc_thread(void *arg) +static void bch2_gc_gens_work(struct work_struct *work) { - struct bch_fs *c = arg; - struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last = atomic64_read(&clock->now); - unsigned last_kick = atomic_read(&c->kick_gc); - - set_freezable(); - - while (1) { - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - - if (kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - return 0; - } - - if (atomic_read(&c->kick_gc) != last_kick) - break; - - if (c->btree_gc_periodic) { - unsigned long next = last + c->capacity / 16; - - if (atomic64_read(&clock->now) >= next) - break; - - bch2_io_clock_schedule_timeout(clock, next); - } else { - schedule(); - } - - try_to_freeze(); - } - __set_current_state(TASK_RUNNING); - - last = 
atomic64_read(&clock->now); - last_kick = atomic_read(&c->kick_gc); - - /* - * Full gc is currently incompatible with btree key cache: - */ -#if 0 - ret = bch2_gc(c, false, false); -#else - bch2_gc_gens(c); -#endif - debug_check_no_locks_held(); - } - - return 0; + struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work); + bch2_gc_gens(c); + bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); } -void bch2_gc_thread_stop(struct bch_fs *c) +void bch2_gc_gens_async(struct bch_fs *c) { - struct task_struct *p; - - p = c->gc_thread; - c->gc_thread = NULL; - - if (p) { - kthread_stop(p); - put_task_struct(p); - } + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) && + !queue_work(c->write_ref_wq, &c->gc_gens_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); } -int bch2_gc_thread_start(struct bch_fs *c) +void bch2_fs_gc_init(struct bch_fs *c) { - struct task_struct *p; + seqcount_init(&c->gc_pos_lock); - if (c->gc_thread) - return 0; - - p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); - if (IS_ERR(p)) { - bch_err_fn(c, PTR_ERR(p)); - return PTR_ERR(p); - } - - get_task_struct(p); - c->gc_thread = p; - wake_up_process(p); - return 0; + INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work); } diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 607575f83a00..1b6489d8e0f4 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -6,10 +6,7 @@ #include "btree_types.h" int bch2_check_topology(struct bch_fs *); -int bch2_gc(struct bch_fs *, bool, bool); -int bch2_gc_gens(struct bch_fs *); -void bch2_gc_thread_stop(struct bch_fs *); -int bch2_gc_thread_start(struct bch_fs *); +int bch2_check_allocations(struct bch_fs *); /* * For concurrent mark and sweep (with other index updates), we define a total @@ -37,16 +34,16 @@ static inline struct gc_pos gc_phase(enum gc_phase phase) { return (struct gc_pos) { .phase = phase, - .pos = POS_MIN, .level = 0, + .pos = POS_MIN, }; } static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) { - 
return cmp_int(l.phase, r.phase) ?: - bpos_cmp(l.pos, r.pos) ?: - cmp_int(l.level, r.level); + return cmp_int(l.phase, r.phase) ?: + -cmp_int(l.level, r.level) ?: + bpos_cmp(l.pos, r.pos); } static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) @@ -60,13 +57,13 @@ static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) } } -static inline struct gc_pos gc_pos_btree(enum btree_id id, - struct bpos pos, unsigned level) +static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level, + struct bpos pos) { return (struct gc_pos) { - .phase = btree_id_to_gc_phase(id), - .pos = pos, + .phase = btree_id_to_gc_phase(btree), .level = level, + .pos = pos, }; } @@ -76,19 +73,7 @@ static inline struct gc_pos gc_pos_btree(enum btree_id id, */ static inline struct gc_pos gc_pos_btree_node(struct btree *b) { - return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level); -} - -/* - * GC position of the pointer to a btree root: we don't use - * gc_pos_pointer_to_btree_node() here to avoid a potential race with - * btree_split() increasing the tree depth - the new root will have level > the - * old root and thus have a greater gc position than the old root, but that - * would be incorrect since once gc has marked the root it's not coming back. 
- */ -static inline struct gc_pos gc_pos_btree_root(enum btree_id id) -{ - return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH); + return gc_pos_btree(b->c.btree_id, b->c.level, b->key.k.p); } static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) @@ -104,11 +89,8 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) return ret; } -static inline void bch2_do_gc_gens(struct bch_fs *c) -{ - atomic_inc(&c->kick_gc); - if (c->gc_thread) - wake_up_process(c->gc_thread); -} +int bch2_gc_gens(struct bch_fs *); +void bch2_gc_gens_async(struct bch_fs *); +void bch2_fs_gc_init(struct bch_fs *); #endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index debb0edc3455..cbf8f5d90602 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -23,6 +23,18 @@ #include <linux/sched/mm.h> +static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) +{ + prt_printf(out, "btree=%s l=%u seq %llux\n", + bch2_btree_id_str(BTREE_NODE_ID(bn)), + (unsigned) BTREE_NODE_LEVEL(bn), bn->keys.seq); + prt_str(out, "min: "); + bch2_bpos_to_text(out, bn->min_key); + prt_newline(out); + prt_str(out, "max: "); + bch2_bpos_to_text(out, bn->max_key); +} + void bch2_btree_node_io_unlock(struct btree *b) { EBUG_ON(!btree_node_write_in_flight(b)); @@ -217,7 +229,6 @@ static bool should_compact_bset(struct btree *b, struct bset_tree *t, static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) { - struct bset_tree *t; bool ret = false; for_each_bset(b, t) { @@ -288,8 +299,7 @@ bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, static void btree_node_sort(struct bch_fs *c, struct btree *b, unsigned start_idx, - unsigned end_idx, - bool filter_whiteouts) + unsigned end_idx) { struct btree_node *out; struct sort_iter_stack sort_iter; @@ -320,7 +330,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, start_time = local_clock(); - u64s = 
bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts); + u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter); out->keys.u64s = cpu_to_le16(u64s); @@ -426,13 +436,12 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b) break; if (b->nsets - unwritten_idx > 1) { - btree_node_sort(c, b, unwritten_idx, - b->nsets, false); + btree_node_sort(c, b, unwritten_idx, b->nsets); ret = true; } if (unwritten_idx > 1) { - btree_node_sort(c, b, 0, unwritten_idx, false); + btree_node_sort(c, b, 0, unwritten_idx); ret = true; } @@ -441,8 +450,6 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b) void bch2_btree_build_aux_trees(struct btree *b) { - struct bset_tree *t; - for_each_bset(b, t) bch2_bset_build_aux_tree(b, t, !bset_written(b, bset(b, t)) && @@ -524,7 +531,9 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_printf(out, "at btree "); bch2_btree_pos_to_text(out, c, b); - prt_printf(out, "\n node offset %u/%u", + printbuf_indent_add(out, 2); + + prt_printf(out, "\nnode offset %u/%u", b->written, btree_ptr_sectors_written(&b->key)); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); @@ -543,6 +552,7 @@ static int __btree_err(int ret, const char *fmt, ...) { struct printbuf out = PRINTBUF; + bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes; va_list args; btree_err_msg(&out, c, ca, b, i, b->written, write); @@ -564,12 +574,14 @@ static int __btree_err(int ret, if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) ret = -BCH_ERR_btree_node_read_err_bad_node; - if (ret != -BCH_ERR_btree_node_read_err_fixable) + if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable) bch2_sb_error_count(c, err_type); switch (ret) { case -BCH_ERR_btree_node_read_err_fixable: - ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf); + ret = !silent + ? 
bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf) + : -BCH_ERR_fsck_fix; if (ret != -BCH_ERR_fsck_fix && ret != -BCH_ERR_fsck_ignore) goto fsck_err; @@ -577,14 +589,17 @@ static int __btree_err(int ret, break; case -BCH_ERR_btree_node_read_err_want_retry: case -BCH_ERR_btree_node_read_err_must_retry: - bch2_print_string_as_lines(KERN_ERR, out.buf); + if (!silent) + bch2_print_string_as_lines(KERN_ERR, out.buf); break; case -BCH_ERR_btree_node_read_err_bad_node: - bch2_print_string_as_lines(KERN_ERR, out.buf); + if (!silent) + bch2_print_string_as_lines(KERN_ERR, out.buf); ret = bch2_topology_error(c); break; case -BCH_ERR_btree_node_read_err_incompatible: - bch2_print_string_as_lines(KERN_ERR, out.buf); + if (!silent) + bch2_print_string_as_lines(KERN_ERR, out.buf); ret = -BCH_ERR_fsck_errors_not_fixed; break; default: @@ -619,8 +634,6 @@ fsck_err: __cold void bch2_btree_node_drop_keys_outside_node(struct btree *b) { - struct bset_tree *t; - for_each_bset(b, t) { struct bset *i = bset(b, t); struct bkey_packed *k; @@ -1021,18 +1034,19 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, btree_node_bad_seq, - "got wrong btree node (want %llx got %llx)\n" - "got btree %s level %llu pos %s", - bp->seq, b->data->keys.seq, - bch2_btree_id_str(BTREE_NODE_ID(b->data)), - BTREE_NODE_LEVEL(b->data), - buf.buf); + "got wrong btree node: got\n%s", + (printbuf_reset(&buf), + bch2_btree_node_header_to_text(&buf, b->data), + buf.buf)); } else { btree_err_on(!b->data->keys.seq, -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, btree_node_bad_seq, - "bad btree header: seq 0"); + "bad btree header: seq 0\n%s", + (printbuf_reset(&buf), + bch2_btree_node_header_to_text(&buf, b->data), + buf.buf)); } while (b->written < (ptr_written ?: btree_sectors(c))) { @@ -1095,7 +1109,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, nonce = btree_nonce(i, b->written << 9); struct 
bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); csum_bad = bch2_crc_cmp(bne->csum, csum); - if (csum_bad) + if (ca && csum_bad) bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); btree_err_on(csum_bad, @@ -1249,12 +1263,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_node_reset_sib_u64s(b); + rcu_read_lock(); bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { - struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); - if (ca2->mi.state != BCH_MEMBER_STATE_rw) + if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) set_btree_node_need_rewrite(b); } + rcu_read_unlock(); if (!ptr_written) set_btree_node_need_rewrite(b); @@ -1279,8 +1295,8 @@ static void btree_node_read_work(struct work_struct *work) struct btree_read_bio *rb = container_of(work, struct btree_read_bio, work); struct bch_fs *c = rb->c; + struct bch_dev *ca = rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL; struct btree *b = rb->b; - struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); struct bio *bio = &rb->bio; struct bch_io_failures failed = { .nr = 0 }; struct printbuf buf = PRINTBUF; @@ -1292,8 +1308,8 @@ static void btree_node_read_work(struct work_struct *work) while (1) { retry = true; bch_info(c, "retrying read"); - ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); - rb->have_ioref = bch2_dev_get_ioref(ca, READ); + ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ); + rb->have_ioref = ca != NULL; bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = rb->pick.ptr.offset; bio->bi_iter.bi_size = btree_buf_bytes(b); @@ -1307,7 +1323,7 @@ static void btree_node_read_work(struct work_struct *work) start: printbuf_reset(&buf); bch2_btree_pos_to_text(&buf, c, b); - bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, + bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read, "btree read error %s for %s", 
bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) @@ -1363,7 +1379,7 @@ static void btree_node_read_endio(struct bio *bio) struct bch_fs *c = rb->c; if (rb->have_ioref) { - struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); bch2_latency_acct(ca, rb->start_time, READ); } @@ -1560,7 +1576,7 @@ static void btree_node_read_all_replicas_endio(struct bio *bio) struct btree_node_read_all *ra = rb->ra; if (rb->have_ioref) { - struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); bch2_latency_acct(ca, rb->start_time, READ); } @@ -1602,14 +1618,14 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool i = 0; bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { - struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); struct btree_read_bio *rb = container_of(ra->bio[i], struct btree_read_bio, bio); rb->c = c; rb->b = b; rb->ra = ra; rb->start_time = local_clock(); - rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->have_ioref = ca != NULL; rb->idx = i; rb->pick = pick; rb->bio.bi_iter.bi_sector = pick.ptr.offset; @@ -1679,7 +1695,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, return; } - ca = bch_dev_bkey_exists(c, pick.ptr.dev); + ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); bio = bio_alloc_bioset(NULL, buf_pages(b->data, btree_buf_bytes(b)), @@ -1691,7 +1707,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, rb->b = b; rb->ra = NULL; rb->start_time = local_clock(); - rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->have_ioref = ca != NULL; rb->pick = pick; INIT_WORK(&rb->work, btree_node_read_work); bio->bi_iter.bi_sector = pick.ptr.offset; @@ -1846,7 +1862,6 @@ static void btree_node_write_work(struct work_struct *work) container_of(work, struct 
btree_write_bio, work); struct bch_fs *c = wbio->wbio.c; struct btree *b = wbio->wbio.bio.bi_private; - struct bch_extent_ptr *ptr; int ret = 0; btree_bounce_free(c, @@ -1896,13 +1911,14 @@ static void btree_node_write_endio(struct bio *bio) struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio); struct bch_fs *c = wbio->c; struct btree *b = wbio->bio.bi_private; - struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL; unsigned long flags; if (wbio->have_ioref) bch2_latency_acct(ca, wbio->submit_time, WRITE); - if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + if (!ca || + bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, "btree write error: %s", bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("btree")) { @@ -1969,7 +1985,6 @@ static void btree_write_submit(struct work_struct *work) void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) { struct btree_write_bio *wbio; - struct bset_tree *t; struct bset *i; struct btree_node *bn = NULL; struct btree_node_entry *bne = NULL; @@ -2095,11 +2110,11 @@ do_write: unwritten_whiteouts_end(b)); SET_BSET_SEPARATE_WHITEOUTS(i, false); - b->whiteout_u64s = 0; - - u64s = bch2_sort_keys(i->start, &sort_iter.iter, false); + u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter); le16_add_cpu(&i->u64s, u64s); + b->whiteout_u64s = 0; + BUG_ON(!b->written && i->u64s != b->data->keys.u64s); set_needs_whiteout(i, false); @@ -2226,7 +2241,6 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) { bool invalidated_iter = false; struct btree_node_entry *bne; - struct bset_tree *t; if (!btree_node_just_written(b)) return false; @@ -2249,7 +2263,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) * single bset: */ if (b->nsets > 1) { - btree_node_sort(c, b, 0, b->nsets, true); + btree_node_sort(c, b, 0, 
b->nsets); invalidated_iter = true; } else { invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); @@ -2346,20 +2360,13 @@ void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c) printbuf_tabstop_push(out, 20); printbuf_tabstop_push(out, 10); - prt_tab(out); - prt_str(out, "nr"); - prt_tab(out); - prt_str(out, "size"); - prt_newline(out); + prt_printf(out, "\tnr\tsize\n"); for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) { u64 nr = atomic64_read(&c->btree_write_stats[i].nr); u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes); - prt_printf(out, "%s:", bch2_btree_write_types[i]); - prt_tab(out); - prt_u64(out, nr); - prt_tab(out); + prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr); prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0); prt_newline(out); } diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index e251cb6b965f..2b8b564fc560 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -81,8 +81,6 @@ static inline bool should_compact_bset_lazy(struct btree *b, static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) { - struct bset_tree *t; - for_each_bset(b, t) if (should_compact_bset_lazy(b, t)) return bch2_compact_whiteouts(c, b, COMPACT_LAZY); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 2a211a4bebd1..5bf98cb8b15d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -61,7 +61,7 @@ static inline int btree_path_cmp(const struct btree_path *l, static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) { /* Are we iterating over keys in all snapshots? 
*/ - if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + if (iter->flags & BTREE_ITER_all_snapshots) { p = bpos_successor(p); } else { p = bpos_nosnap_successor(p); @@ -74,7 +74,7 @@ static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) { /* Are we iterating over keys in all snapshots? */ - if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + if (iter->flags & BTREE_ITER_all_snapshots) { p = bpos_predecessor(p); } else { p = bpos_nosnap_predecessor(p); @@ -88,7 +88,7 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) { struct bpos pos = iter->pos; - if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + if ((iter->flags & BTREE_ITER_is_extents) && !bkey_eq(pos, POS_MAX)) pos = bkey_successor(iter, pos); return pos; @@ -253,13 +253,13 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) BUG_ON(iter->btree_id >= BTREE_ID_NR); - BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != btree_iter_path(trans, iter)->cached); + BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached); - BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && - (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + BUG_ON((iter->flags & BTREE_ITER_is_extents) && + (iter->flags & BTREE_ITER_all_snapshots)); - BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && - (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) && + (iter->flags & BTREE_ITER_all_snapshots) && !btree_type_has_snapshot_field(iter->btree_id)); if (iter->update_path) @@ -269,10 +269,10 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) { - BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && !iter->pos.snapshot); - BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) && 
iter->pos.snapshot != iter->snapshot); BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || @@ -289,7 +289,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k if (!bch2_debug_check_iterators) return 0; - if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) + if (!(iter->flags & BTREE_ITER_filter_snapshots)) return 0; if (bkey_err(k) || !k.k) @@ -300,8 +300,8 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k k.k->p.snapshot)); bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos, - BTREE_ITER_NOPRESERVE| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_nopreserve| + BTREE_ITER_all_snapshots); prev = bch2_btree_iter_prev(&copy); if (!prev.k) goto out; @@ -897,7 +897,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, bch2_bkey_buf_reassemble(out, c, k); - if ((flags & BTREE_ITER_PREFETCH) && + if ((flags & BTREE_ITER_prefetch) && c->opts.btree_node_prefetch) ret = btree_path_prefetch_j(trans, path, &jiter); @@ -944,7 +944,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, bch2_bkey_buf_unpack(&tmp, c, l->b, k); - if ((flags & BTREE_ITER_PREFETCH) && + if ((flags & BTREE_ITER_prefetch) && c->opts.btree_node_prefetch) { ret = btree_path_prefetch(trans, path); if (ret) @@ -999,6 +999,7 @@ retry_all: bch2_trans_unlock(trans); cond_resched(); + trans->locked = true; if (unlikely(trans->memory_allocation_failure)) { struct closure cl; @@ -1162,6 +1163,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, goto out_uptodate; path->level = btree_path_up_until_good_node(trans, path, 0); + unsigned max_level = path->level; EBUG_ON(btree_path_node(path, path->level) && !btree_node_locked(path, path->level)); @@ -1192,6 +1194,16 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, goto out; } } + + if (unlikely(max_level > path->level)) { + struct btree_path *linked; + unsigned iter; + + trans_for_each_path_with_node(trans, path_l(path)->b, 
linked, iter) + for (unsigned j = path->level + 1; j < max_level; j++) + linked->l[j] = path->l[j]; + } + out_uptodate: path->uptodate = BTREE_ITER_UPTODATE; out: @@ -1221,11 +1233,14 @@ static inline void btree_path_copy(struct btree_trans *trans, struct btree_path } static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src, - bool intent) + bool intent, unsigned long ip) { btree_path_idx_t new = btree_path_alloc(trans, src); btree_path_copy(trans, trans->paths + new, trans->paths + src); __btree_path_get(trans->paths + new, intent); +#ifdef TRACK_PATH_ALLOCATED + trans->paths[new].ip_allocated = ip; +#endif return new; } @@ -1234,7 +1249,7 @@ btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans, btree_path_idx_t path, bool intent, unsigned long ip) { __btree_path_put(trans->paths + path, intent); - path = btree_path_clone(trans, path, intent); + path = btree_path_clone(trans, path, intent, ip); trans->paths[path].preserve = false; return path; } @@ -1334,6 +1349,26 @@ static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t __clear_bit(path, trans->paths_allocated); } +static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path) +{ + unsigned l = path->level; + + do { + if (!btree_path_node(path, l)) + break; + + if (!is_btree_node(path, l)) + return false; + + if (path->l[l].lock_seq != path->l[l].b->c.lock.seq) + return false; + + l++; + } while (l < path->locks_want); + + return true; +} + void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent) { struct btree_path *path = trans->paths + path_idx, *dup; @@ -1348,10 +1383,15 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) return; - if (path->should_be_locked && - !trans->restarted && - (!dup || !bch2_btree_path_relock_norestart(trans, dup))) - return; + if (path->should_be_locked 
&& !trans->restarted) { + if (!dup) + return; + + if (!(trans->locked + ? bch2_btree_path_relock_norestart(trans, dup) + : bch2_btree_path_can_relock(trans, dup))) + return; + } if (dup) { dup->preserve |= path->preserve; @@ -1384,22 +1424,26 @@ void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) (void *) trans->last_restarted_ip); } +void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans) +{ + panic("trans should be locked, unlocked by %pS\n", + (void *) trans->last_unlock_ip); +} + noinline __cold void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { - prt_printf(buf, "transaction updates for %s journal seq %llu", + prt_printf(buf, "transaction updates for %s journal seq %llu\n", trans->fn, trans->journal_res.seq); - prt_newline(buf); printbuf_indent_add(buf, 2); trans_for_each_update(trans, i) { struct bkey_s_c old = { &i->old_k, i->old_v }; - prt_printf(buf, "update: btree=%s cached=%u %pS", + prt_printf(buf, "update: btree=%s cached=%u %pS\n", bch2_btree_id_str(i->btree_id), i->cached, (void *) i->ip_allocated); - prt_newline(buf); prt_printf(buf, " old "); bch2_bkey_val_to_text(buf, trans->c, old); @@ -1428,23 +1472,63 @@ void bch2_dump_trans_updates(struct btree_trans *trans) printbuf_exit(&buf); } -static void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) +static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) { struct btree_path *path = trans->paths + path_idx; - prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", + prt_printf(out, "path: idx %2u ref %u:%u %c %c %c btree=%s l=%u pos ", path_idx, path->ref, path->intent_ref, path->preserve ? 'P' : ' ', path->should_be_locked ? 'S' : ' ', + path->cached ? 
'C' : 'B', bch2_btree_id_str(path->btree_id), path->level); bch2_bpos_to_text(out, path->pos); - prt_printf(out, " locks %u", path->nodes_locked); #ifdef TRACK_PATH_ALLOCATED prt_printf(out, " %pS", (void *) path->ip_allocated); #endif +} + +static const char *btree_node_locked_str(enum btree_node_locked_type t) +{ + switch (t) { + case BTREE_NODE_UNLOCKED: + return "unlocked"; + case BTREE_NODE_READ_LOCKED: + return "read"; + case BTREE_NODE_INTENT_LOCKED: + return "intent"; + case BTREE_NODE_WRITE_LOCKED: + return "write"; + default: + return NULL; + } +} + +void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) +{ + bch2_btree_path_to_text_short(out, trans, path_idx); + + struct btree_path *path = trans->paths + path_idx; + + prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want); prt_newline(out); + + printbuf_indent_add(out, 2); + for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { + prt_printf(out, "l=%u locks %s seq %u node ", l, + btree_node_locked_str(btree_node_locked_type(path, l)), + path->l[l].lock_seq); + + int ret = PTR_ERR_OR_ZERO(path->l[l].b); + if (ret) + prt_str(out, bch2_err_str(ret)); + else + prt_printf(out, "%px", path->l[l].b); + prt_newline(out); + } + printbuf_indent_sub(out, 2); } static noinline __cold @@ -1456,8 +1540,10 @@ void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, if (!nosort) btree_trans_sort_paths(trans); - trans_for_each_path_idx_inorder(trans, iter) - bch2_btree_path_to_text(out, trans, iter.path_idx); + trans_for_each_path_idx_inorder(trans, iter) { + bch2_btree_path_to_text_short(out, trans, iter.path_idx); + prt_newline(out); + } } noinline __cold @@ -1608,11 +1694,12 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, unsigned flags, unsigned long ip) { struct btree_path *path; - bool cached = flags & BTREE_ITER_CACHED; - bool intent = flags & BTREE_ITER_INTENT; + bool cached = flags & BTREE_ITER_cached; + 
bool intent = flags & BTREE_ITER_intent; struct trans_for_each_path_inorder_iter iter; btree_path_idx_t path_pos = 0, path_idx; + bch2_trans_verify_not_unlocked(trans); bch2_trans_verify_not_in_restart(trans); bch2_trans_verify_locks(trans); @@ -1657,7 +1744,7 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, trans->paths_sorted = false; } - if (!(flags & BTREE_ITER_NOPRESERVE)) + if (!(flags & BTREE_ITER_nopreserve)) path->preserve = true; if (path->intent_ref) @@ -1678,6 +1765,22 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, return path_idx; } +btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans, + enum btree_id btree_id, + unsigned level, + struct bpos pos) +{ + btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level, + BTREE_ITER_nopreserve| + BTREE_ITER_intent, _RET_IP_); + path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_); + + struct btree_path *path = trans->paths + path_idx; + bch2_btree_path_downgrade(trans, path); + __bch2_btree_path_unlock(trans, path); + return path_idx; +} + struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) { @@ -1719,6 +1822,19 @@ hole: return (struct bkey_s_c) { u, NULL }; } + +void bch2_set_btree_iter_dontneed(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + + if (!iter->path || trans->restarted) + return; + + struct btree_path *path = btree_iter_path(trans, iter); + path->preserve = false; + if (path->ref == 1) + path->should_be_locked = false; +} /* Btree iterators: */ int __must_check @@ -1733,9 +1849,11 @@ bch2_btree_iter_traverse(struct btree_iter *iter) struct btree_trans *trans = iter->trans; int ret; + bch2_trans_verify_not_unlocked(trans); + iter->path = bch2_btree_path_set_pos(trans, iter->path, btree_iter_search_key(iter), - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(iter->trans, iter->path, 
iter->flags); @@ -1774,7 +1892,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->k.p = iter->pos = b->key.k.p; iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); btree_path_set_should_be_locked(btree_iter_path(trans, iter)); out: @@ -1835,13 +1953,16 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) if (bpos_eq(iter->pos, b->key.k.p)) { __btree_path_set_level_up(trans, path, path->level++); } else { + if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED) + btree_node_unlock(trans, path, path->level + 1); + /* * Haven't gotten to the end of the parent node: go back down to * the next child node */ iter->path = bch2_btree_path_set_pos(trans, iter->path, bpos_successor(iter->pos), - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); path = btree_iter_path(trans, iter); @@ -1859,7 +1980,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->k.p = iter->pos = b->key.k.p; iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); btree_path_set_should_be_locked(btree_iter_path(trans, iter)); EBUG_ON(btree_iter_path(trans, iter)->uptodate); @@ -1878,11 +1999,11 @@ err: inline bool bch2_btree_iter_advance(struct btree_iter *iter) { struct bpos pos = iter->k.p; - bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + bool ret = !(iter->flags & BTREE_ITER_all_snapshots ? 
bpos_eq(pos, SPOS_MAX) : bkey_eq(pos, SPOS_MAX)); - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (ret && !(iter->flags & BTREE_ITER_is_extents)) pos = bkey_successor(iter, pos); bch2_btree_iter_set_pos(iter, pos); return ret; @@ -1891,11 +2012,11 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) inline bool bch2_btree_iter_rewind(struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); - bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + bool ret = !(iter->flags & BTREE_ITER_all_snapshots ? bpos_eq(pos, POS_MIN) : bkey_eq(pos, POS_MIN)); - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (ret && !(iter->flags & BTREE_ITER_is_extents)) pos = bkey_predecessor(iter, pos); bch2_btree_iter_set_pos(iter, pos); return ret; @@ -2006,7 +2127,10 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos struct bkey_s_c k; int ret; - if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) && + bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked(trans); + + if ((iter->flags & BTREE_ITER_key_cache_fill) && bpos_eq(iter->pos, pos)) return bkey_s_c_null; @@ -2015,17 +2139,17 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos if (!iter->key_cache_path) iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, - iter->flags & BTREE_ITER_INTENT, 0, - iter->flags|BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL, + iter->flags & BTREE_ITER_intent, 0, + iter->flags|BTREE_ITER_cached| + BTREE_ITER_cached_nofill, _THIS_IP_); iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->key_cache_path, - iter->flags|BTREE_ITER_CACHED) ?: + iter->flags|BTREE_ITER_cached) ?: bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_); if (unlikely(ret)) return bkey_s_c_err(ret); @@ -2053,7 +2177,7 
@@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp struct btree_path_level *l; iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); @@ -2078,7 +2202,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp k = btree_path_level_peek_all(trans->c, l, &iter->k); - if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && k.k && (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { k = k2; @@ -2089,10 +2213,10 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp } } - if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) + if (unlikely(iter->flags & BTREE_ITER_with_journal)) k = btree_trans_peek_journal(trans, iter, k); - if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) bch2_btree_trans_peek_updates(trans, iter, &k); @@ -2144,11 +2268,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e struct bpos iter_pos; int ret; - EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX)); + bch2_trans_verify_not_unlocked(trans); + EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); if (iter->update_path) { bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->update_path = 0; } @@ -2171,7 +2296,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * isn't monotonically increasing before FILTER_SNAPSHOTS, and * that's what we check against in extents mode: */ - if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) + if (unlikely(!(iter->flags & BTREE_ITER_is_extents) ? 
bkey_gt(k.k->p, end) : k.k->p.inode > end.inode)) goto end; @@ -2179,13 +2304,13 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e if (iter->update_path && !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->update_path = 0; } - if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && - (iter->flags & BTREE_ITER_INTENT) && - !(iter->flags & BTREE_ITER_IS_EXTENTS) && + if ((iter->flags & BTREE_ITER_filter_snapshots) && + (iter->flags & BTREE_ITER_intent) && + !(iter->flags & BTREE_ITER_is_extents) && !iter->update_path) { struct bpos pos = k.k->p; @@ -2200,12 +2325,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * advance, same as on exit for iter->path, but only up * to snapshot */ - __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_INTENT); + __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_intent); iter->update_path = iter->path; iter->update_path = bch2_btree_path_set_pos(trans, iter->update_path, pos, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, _THIS_IP_); ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); if (unlikely(ret)) { @@ -2218,7 +2343,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * We can never have a key in a leaf node at POS_MAX, so * we don't have to check these successor() calls: */ - if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + if ((iter->flags & BTREE_ITER_filter_snapshots) && !bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) { @@ -2227,7 +2352,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e } if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + !(iter->flags & BTREE_ITER_all_snapshots)) { search_key = bkey_successor(iter, k.k->p); continue; } 
@@ -2237,12 +2362,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * equal to the key we just returned - except extents can * straddle iter->pos: */ - if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (!(iter->flags & BTREE_ITER_is_extents)) iter_pos = k.k->p; else iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); - if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) + if (unlikely(!(iter->flags & BTREE_ITER_is_extents) ? bkey_gt(iter_pos, end) : bkey_ge(iter_pos, end))) goto end; @@ -2253,7 +2378,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->pos = iter_pos; iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); btree_path_set_should_be_locked(btree_iter_path(trans, iter)); @@ -2266,7 +2391,7 @@ out_no_locked: btree_path_set_should_be_locked(trans->paths + iter->update_path); } - if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + if (!(iter->flags & BTREE_ITER_all_snapshots)) iter->pos.snapshot = iter->snapshot; ret = bch2_btree_iter_verify_ret(iter, k); @@ -2316,21 +2441,22 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) btree_path_idx_t saved_path = 0; int ret; + bch2_trans_verify_not_unlocked(trans); EBUG_ON(btree_iter_path(trans, iter)->cached || btree_iter_path(trans, iter)->level); - if (iter->flags & BTREE_ITER_WITH_JOURNAL) + if (iter->flags & BTREE_ITER_with_journal) return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + if (iter->flags & BTREE_ITER_filter_snapshots) search_key.snapshot = U32_MAX; while (1) { iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = 
bch2_btree_path_traverse(trans, iter->path, iter->flags); @@ -2345,17 +2471,17 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) k = btree_path_level_peek(trans, path, &path->l[0], &iter->k); if (!k.k || - ((iter->flags & BTREE_ITER_IS_EXTENTS) + ((iter->flags & BTREE_ITER_is_extents) ? bpos_ge(bkey_start_pos(k.k), search_key) : bpos_gt(k.k->p, search_key))) k = btree_path_level_prev(trans, path, &path->l[0], &iter->k); - if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) bch2_btree_trans_peek_prev_updates(trans, iter, &k); if (likely(k.k)) { - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { + if (iter->flags & BTREE_ITER_filter_snapshots) { if (k.k->p.snapshot == iter->snapshot) goto got_key; @@ -2366,7 +2492,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) */ if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { bch2_path_put_nokeep(trans, iter->path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->path = saved_path; saved_path = 0; iter->k = saved_k; @@ -2379,9 +2505,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) k.k->p.snapshot)) { if (saved_path) bch2_path_put_nokeep(trans, saved_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); saved_path = btree_path_clone(trans, iter->path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent, + _THIS_IP_); path = btree_iter_path(trans, iter); saved_k = *k.k; saved_v = k.v; @@ -2392,9 +2519,9 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) } got_key: if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + !(iter->flags & BTREE_ITER_all_snapshots)) { search_key = bkey_predecessor(iter, k.k->p); - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + if (iter->flags & BTREE_ITER_filter_snapshots) search_key.snapshot = U32_MAX; continue; } @@ -2418,11 +2545,11 @@ got_key: 
if (bkey_lt(k.k->p, iter->pos)) iter->pos = k.k->p; - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + if (iter->flags & BTREE_ITER_filter_snapshots) iter->pos.snapshot = iter->snapshot; out_no_locked: if (saved_path) - bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); + bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent); bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -2452,12 +2579,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c k; int ret; + bch2_trans_verify_not_unlocked(trans); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); + EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); /* extents can't span inode numbers: */ - if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + if ((iter->flags & BTREE_ITER_is_extents) && unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { if (iter->pos.inode == KEY_INODE_MAX) return bkey_s_c_null; @@ -2467,7 +2595,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) search_key = btree_iter_search_key(iter); iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); @@ -2476,22 +2604,22 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out_no_locked; } - if ((iter->flags & BTREE_ITER_CACHED) || - !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { + if ((iter->flags & BTREE_ITER_cached) || + !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) { k = bkey_s_c_null; - if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) { 
bch2_btree_trans_peek_slot_updates(trans, iter, &k); if (k.k) goto out; } - if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && + if (unlikely(iter->flags & BTREE_ITER_with_journal) && (k = btree_trans_peek_slot_journal(trans, iter)).k) goto out; - if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { if (!bkey_err(k)) iter->k = *k.k; @@ -2506,12 +2634,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bpos next; struct bpos end = iter->pos; - if (iter->flags & BTREE_ITER_IS_EXTENTS) + if (iter->flags & BTREE_ITER_is_extents) end.offset = U64_MAX; EBUG_ON(btree_iter_path(trans, iter)->level); - if (iter->flags & BTREE_ITER_INTENT) { + if (iter->flags & BTREE_ITER_intent) { struct btree_iter iter2; bch2_trans_copy_iter(&iter2, iter); @@ -2542,7 +2670,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bkey_init(&iter->k); iter->k.p = iter->pos; - if (iter->flags & BTREE_ITER_IS_EXTENTS) { + if (iter->flags & BTREE_ITER_is_extents) { bch2_key_resize(&iter->k, min_t(u64, KEY_SIZE_MAX, (next.inode == iter->pos.inode @@ -2726,13 +2854,13 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) { if (iter->update_path) bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); if (iter->path) bch2_path_put(trans, iter->path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); if (iter->key_cache_path) bch2_path_put(trans, iter->key_cache_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->path = 0; iter->update_path = 0; iter->key_cache_path = 0; @@ -2757,9 +2885,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, unsigned depth, unsigned flags) { - flags |= BTREE_ITER_NOT_EXTENTS; - flags |= __BTREE_ITER_ALL_SNAPSHOTS; - flags |= BTREE_ITER_ALL_SNAPSHOTS; + 
flags |= BTREE_ITER_not_extents; + flags |= BTREE_ITER_snapshot_field; + flags |= BTREE_ITER_all_snapshots; bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, __bch2_btree_iter_flags(trans, btree_id, flags), @@ -2782,9 +2910,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) dst->ip_allocated = _RET_IP_; #endif if (src->path) - __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT); + __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_intent); if (src->update_path) - __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_INTENT); + __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_intent); dst->key_cache_path = 0; } @@ -2953,7 +3081,8 @@ u32 bch2_trans_begin(struct btree_trans *trans) if (!trans->restarted && (need_resched() || time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) { - drop_locks_do(trans, (cond_resched(), 0)); + bch2_trans_unlock(trans); + cond_resched(); now = local_clock(); } trans->last_begin_time = now; @@ -2963,11 +3092,14 @@ u32 bch2_trans_begin(struct btree_trans *trans) bch2_trans_srcu_unlock(trans); trans->last_begin_ip = _RET_IP_; + trans->locked = true; + if (trans->restarted) { bch2_btree_path_traverse_all(trans); trans->notrace_relock_fail = false; } + bch2_trans_verify_not_unlocked(trans); return trans->restart_count; } @@ -3020,7 +3152,7 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) */ BUG_ON(pos_task && pid == pos_task->pid && - bch2_trans_locked(pos)); + pos->locked); if (pos_task && pid < pos_task->pid) { list_add_tail(&trans->list, &pos->list); @@ -3036,8 +3168,9 @@ got_trans: trans->last_begin_time = local_clock(); trans->fn_idx = fn_idx; trans->locking_wait.task = current; + trans->locked = true; trans->journal_replay_not_finished = - unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && + unlikely(!test_bit(JOURNAL_replay_done, 
&c->journal.flags)) && atomic_inc_not_zero(&c->journal_keys.ref); trans->nr_paths = ARRAY_SIZE(trans->_paths); trans->paths_allocated = trans->_paths_allocated; @@ -3166,13 +3299,11 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, pid = owner ? owner->pid : 0; rcu_read_unlock(); - prt_tab(out); - prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b', + prt_printf(out, "\t%px %c l=%u %s:", b, b->cached ? 'c' : 'b', b->level, bch2_btree_id_str(b->btree_id)); bch2_bpos_to_text(out, btree_node_pos(b)); - prt_tab(out); - prt_printf(out, " locks %u:%u:%u held by pid %u", + prt_printf(out, "\t locks %u:%u:%u held by pid %u", c.n[0], c.n[1], c.n[2], pid); } @@ -3229,10 +3360,8 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) b = READ_ONCE(trans->locking); if (b) { - prt_printf(out, " blocked for %lluus on", - div_u64(local_clock() - trans->locking_wait.start_time, - 1000)); - prt_newline(out); + prt_printf(out, " blocked for %lluus on\n", + div_u64(local_clock() - trans->locking_wait.start_time, 1000)); prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); bch2_btree_bkey_cached_common_to_text(out, b); prt_newline(out); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 1c70836dd7cc..eab2a25bdc7a 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -216,9 +216,13 @@ int __must_check bch2_btree_path_traverse_one(struct btree_trans *, btree_path_idx_t, unsigned, unsigned long); +static inline void bch2_trans_verify_not_unlocked(struct btree_trans *); + static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, btree_path_idx_t path, unsigned flags) { + bch2_trans_verify_not_unlocked(trans); + if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK) return 0; @@ -227,6 +231,9 @@ static inline int __must_check bch2_btree_path_traverse(struct btree_trans *tran btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, unsigned, 
unsigned, unsigned, unsigned long); +btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id, + unsigned, struct bpos); + struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); /* @@ -283,7 +290,6 @@ int bch2_trans_relock(struct btree_trans *); int bch2_trans_relock_notrace(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); void bch2_trans_unlock_long(struct btree_trans *); -bool bch2_trans_locked(struct btree_trans *); static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count) { @@ -309,6 +315,14 @@ static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) bch2_trans_in_restart_error(trans); } +void __noreturn bch2_trans_unlocked_error(struct btree_trans *); + +static inline void bch2_trans_verify_not_unlocked(struct btree_trans *trans) +{ + if (!trans->locked) + bch2_trans_unlocked_error(trans); +} + __always_inline static int btree_trans_restart_nounlock(struct btree_trans *trans, int err) { @@ -386,10 +400,10 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos if (unlikely(iter->update_path)) bch2_path_put(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->update_path = 0; - if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + if (!(iter->flags & BTREE_ITER_all_snapshots)) new_pos.snapshot = iter->snapshot; __bch2_btree_iter_set_pos(iter, new_pos); @@ -397,7 +411,7 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) { - BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS)); + BUG_ON(!(iter->flags & BTREE_ITER_is_extents)); iter->pos = bkey_start_pos(&iter->k); } @@ -416,20 +430,20 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, unsigned btree_id, unsigned flags) { - if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && + if 
(!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) && btree_id_is_extents(btree_id)) - flags |= BTREE_ITER_IS_EXTENTS; + flags |= BTREE_ITER_is_extents; - if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && + if (!(flags & BTREE_ITER_snapshot_field) && !btree_type_has_snapshot_field(btree_id)) - flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + flags &= ~BTREE_ITER_all_snapshots; - if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && + if (!(flags & BTREE_ITER_all_snapshots) && btree_type_has_snapshots(btree_id)) - flags |= BTREE_ITER_FILTER_SNAPSHOTS; + flags |= BTREE_ITER_filter_snapshots; if (trans->journal_replay_not_finished) - flags |= BTREE_ITER_WITH_JOURNAL; + flags |= BTREE_ITER_with_journal; return flags; } @@ -439,10 +453,10 @@ static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, unsigned flags) { if (!btree_id_cached(trans->c, btree_id)) { - flags &= ~BTREE_ITER_CACHED; - flags &= ~BTREE_ITER_WITH_KEY_CACHE; - } else if (!(flags & BTREE_ITER_CACHED)) - flags |= BTREE_ITER_WITH_KEY_CACHE; + flags &= ~BTREE_ITER_cached; + flags &= ~BTREE_ITER_with_key_cache; + } else if (!(flags & BTREE_ITER_cached)) + flags |= BTREE_ITER_with_key_cache; return __bch2_btree_iter_flags(trans, btree_id, flags); } @@ -494,18 +508,7 @@ void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, unsigned, unsigned, unsigned); void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); -static inline void set_btree_iter_dontneed(struct btree_iter *iter) -{ - struct btree_trans *trans = iter->trans; - - if (!iter->path || trans->restarted) - return; - - struct btree_path *path = btree_iter_path(trans, iter); - path->preserve = false; - if (path->ref == 1) - path->should_be_locked = false; -} +void bch2_set_btree_iter_dontneed(struct btree_iter *); void *__bch2_trans_kmalloc(struct btree_trans *, size_t); @@ -619,14 +622,14 @@ u32 bch2_trans_begin(struct btree_trans *); static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, 
unsigned flags) { - return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek_prev(iter); } static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek(iter); } @@ -634,7 +637,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter * struct bpos end, unsigned flags) { - if (!(flags & BTREE_ITER_SLOTS)) + if (!(flags & BTREE_ITER_slots)) return bch2_btree_iter_peek_upto(iter, end); if (bkey_gt(iter->pos, end)) @@ -699,16 +702,12 @@ transaction_restart: \ _ret2 ?: trans_was_restarted(_trans, _restart_count); \ }) -#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _do) \ +#define for_each_btree_key_upto_continue(_trans, _iter, \ + _end, _flags, _k, _do) \ ({ \ - struct btree_iter _iter; \ struct bkey_s_c _k; \ int _ret3 = 0; \ \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ do { \ _ret3 = lockrestart_do(_trans, ({ \ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \ @@ -724,6 +723,21 @@ transaction_restart: \ _ret3; \ }) +#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \ + for_each_btree_key_upto_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) + +#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _do) \ +({ \ + bch2_trans_begin(trans); \ + \ + struct btree_iter _iter; \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + for_each_btree_key_upto_continue(_trans, _iter, _end, _flags, _k, _do);\ +}) + #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \ @@ -794,14 +808,6 @@ 
__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, return k; } -#define for_each_btree_key_old(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) - #define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ @@ -861,6 +867,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, }) void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); +void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 1e8cf49a6935..332dbf164929 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -623,3 +623,20 @@ void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree, keys->data[dst++] = *i; keys->nr = keys->gap = dst; } + +void bch2_journal_keys_dump(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + struct printbuf buf = PRINTBUF; + + pr_info("%zu keys:", keys->nr); + + move_gap(keys, keys->nr); + + darray_for_each(*keys, i) { + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); + pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf); + } + printbuf_exit(&buf); +} diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index af25046ebcaa..1ba4a79b0ef9 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -70,4 +70,6 @@ void 
bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, unsigned, unsigned, struct bpos, struct bpos); +void bch2_journal_keys_dump(struct bch_fs *); + #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 7dafa1accec2..75f5e6fe4634 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -383,9 +383,9 @@ static int btree_key_cache_fill(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos, - BTREE_ITER_KEY_CACHE_FILL| - BTREE_ITER_CACHED_NOFILL); - iter.flags &= ~BTREE_ITER_WITH_JOURNAL; + BTREE_ITER_key_cache_fill| + BTREE_ITER_cached_nofill); + iter.flags &= ~BTREE_ITER_with_journal; k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -456,7 +456,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); /* We're not likely to need this iterator again: */ - set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(&iter); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -515,23 +515,10 @@ retry: fill: path->uptodate = BTREE_ITER_UPTODATE; - if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { - /* - * Using the underscore version because we haven't set - * path->uptodate yet: - */ - if (!path->locks_want && - !__bch2_btree_path_upgrade(trans, path, 1, NULL)) { - trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); - goto err; - } - - ret = btree_key_cache_fill(trans, path, ck); - if (ret) - goto err; - - ret = bch2_btree_path_relock(trans, path, _THIS_IP_); + if (!ck->valid && !(flags & BTREE_ITER_cached_nofill)) { + ret = bch2_btree_path_upgrade(trans, path, 1) ?: + btree_key_cache_fill(trans, path, ck) ?: + bch2_btree_path_relock(trans, path, _THIS_IP_); if (ret) goto err; @@ -622,13 +609,13 @@ static int 
btree_key_cache_flush_pos(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, - BTREE_ITER_SLOTS| - BTREE_ITER_INTENT| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_slots| + BTREE_ITER_intent| + BTREE_ITER_all_snapshots); bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; + BTREE_ITER_cached| + BTREE_ITER_intent); + b_iter.flags &= ~BTREE_ITER_with_key_cache; ret = bch2_btree_iter_traverse(&c_iter); if (ret) @@ -661,14 +648,14 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, commit_flags |= BCH_WATERMARK_reclaim; if (ck->journal.seq != journal_last_seq(j) || - !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) + !test_bit(JOURNAL_space_low, &c->journal.flags)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; ret = bch2_btree_iter_traverse(&b_iter) ?: bch2_trans_update(trans, &b_iter, ck->k, - BTREE_UPDATE_KEY_CACHE_RECLAIM| - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - BTREE_TRIGGER_NORUN) ?: + BTREE_UPDATE_key_cache_reclaim| + BTREE_UPDATE_internal_snapshot_node| + BTREE_TRIGGER_norun) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| @@ -790,7 +777,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, * flushing. The flush callback will not proceed unless ->seq matches * the latest pin, so make sure it starts with a consistent value. 
*/ - if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) || + if (!(insert_entry->flags & BTREE_UPDATE_nojournal) || !journal_pin_active(&ck->journal)) { ck->seq = trans->journal_res.seq; } @@ -835,6 +822,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, int srcu_idx; mutex_lock(&bc->lock); + bc->requested_to_free += sc->nr_to_scan; + srcu_idx = srcu_read_lock(&c->btree_trans_barrier); flags = memalloc_nofs_save(); @@ -853,6 +842,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, atomic_long_dec(&bc->nr_freed); freed++; bc->nr_freed_nonpcpu--; + bc->freed++; } list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { @@ -866,6 +856,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, atomic_long_dec(&bc->nr_freed); freed++; bc->nr_freed_pcpu--; + bc->freed++; } rcu_read_lock(); @@ -884,13 +875,18 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, ck = container_of(pos, struct bkey_cached, hash); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + bc->skipped_dirty++; goto next; } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) { clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); + bc->skipped_accessed++; goto next; } else if (bkey_cached_lock_for_evict(ck)) { bkey_cached_evict(bc, ck); bkey_cached_free(bc, ck); + bc->moved_to_freelist++; + } else { + bc->skipped_lock_fail++; } scanned++; @@ -1037,14 +1033,47 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) return 0; } -void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) +void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc) { - prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed)); - prt_newline(out); - prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys)); - prt_newline(out); - prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty)); - prt_newline(out); + struct bch_fs *c = container_of(bc, struct bch_fs, 
btree_key_cache); + + printbuf_tabstop_push(out, 24); + printbuf_tabstop_push(out, 12); + + unsigned flags = memalloc_nofs_save(); + mutex_lock(&bc->lock); + prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys)); + prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty)); + prt_printf(out, "freelist:\t%lu\r\n", atomic_long_read(&bc->nr_freed)); + prt_printf(out, "nonpcpu freelist:\t%zu\r\n", bc->nr_freed_nonpcpu); + prt_printf(out, "pcpu freelist:\t%zu\r\n", bc->nr_freed_pcpu); + + prt_printf(out, "\nshrinker:\n"); + prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free); + prt_printf(out, "freed:\t%lu\r\n", bc->freed); + prt_printf(out, "moved_to_freelist:\t%lu\r\n", bc->moved_to_freelist); + prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty); + prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed); + prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail); + + prt_printf(out, "srcu seq:\t%lu\r\n", get_state_synchronize_srcu(&c->btree_trans_barrier)); + + struct bkey_cached *ck; + unsigned iter = 0; + list_for_each_entry(ck, &bc->freed_nonpcpu, list) { + prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq); + if (++iter > 10) + break; + } + + iter = 0; + list_for_each_entry(ck, &bc->freed_pcpu, list) { + prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq); + if (++iter > 10) + break; + } + mutex_unlock(&bc->lock); + memalloc_flags_restore(flags); } void bch2_btree_key_cache_exit(void) diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h index 290e4e57df5b..237e8bb3ac40 100644 --- a/fs/bcachefs/btree_key_cache_types.h +++ b/fs/bcachefs/btree_key_cache_types.h @@ -24,6 +24,14 @@ struct btree_key_cache { atomic_long_t nr_freed; atomic_long_t nr_keys; atomic_long_t nr_dirty; + + /* shrinker stats */ + unsigned long requested_to_free; + unsigned long freed; + unsigned long moved_to_freelist; + unsigned long skipped_dirty; + 
unsigned long skipped_accessed; + unsigned long skipped_lock_fail; }; struct bkey_cached_key { diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index f2caf491957e..c3e9b0cc7bbd 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -83,8 +83,7 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) { struct trans_waiting_for_lock *i; - prt_printf(out, "Found lock cycle (%u entries):", g->nr); - prt_newline(out); + prt_printf(out, "Found lock cycle (%u entries):\n", g->nr); for (i = g->g; i < g->g + g->nr; i++) { struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); @@ -224,8 +223,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) bch2_btree_trans_to_text(&buf, trans); - prt_printf(&buf, "backtrace:"); - prt_newline(&buf); + prt_printf(&buf, "backtrace:\n"); printbuf_indent_add(&buf, 2); bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); printbuf_indent_sub(&buf, 2); @@ -492,8 +490,6 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, if (path->uptodate == BTREE_ITER_NEED_RELOCK) path->uptodate = BTREE_ITER_UPTODATE; - bch2_trans_verify_locks(trans); - return path->uptodate < BTREE_ITER_NEED_RELOCK; } @@ -609,7 +605,9 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_pa { struct get_locks_fail f; - return btree_path_get_locks(trans, path, false, &f); + bool ret = btree_path_get_locks(trans, path, false, &f); + bch2_trans_verify_locks(trans); + return ret; } int __bch2_btree_path_relock(struct btree_trans *trans, @@ -632,7 +630,9 @@ bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, path->locks_want = new_locks_want; - return btree_path_get_locks(trans, path, true, f); + bool ret = btree_path_get_locks(trans, path, true, f); + bch2_trans_verify_locks(trans); + return ret; } bool __bch2_btree_path_upgrade(struct btree_trans *trans, @@ -640,8 +640,9 @@ bool 
__bch2_btree_path_upgrade(struct btree_trans *trans, unsigned new_locks_want, struct get_locks_fail *f) { - if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f)) - return true; + bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f); + if (ret) + goto out; /* * XXX: this is ugly - we'd prefer to not be mucking with other @@ -675,8 +676,9 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, btree_path_get_locks(trans, linked, true, NULL); } } - - return false; +out: + bch2_trans_verify_locks(trans); + return ret; } void __bch2_btree_path_downgrade(struct btree_trans *trans, @@ -725,82 +727,100 @@ void bch2_trans_downgrade(struct btree_trans *trans) bch2_btree_path_downgrade(trans, path); } -int bch2_trans_relock(struct btree_trans *trans) +static inline void __bch2_trans_unlock(struct btree_trans *trans) { struct btree_path *path; unsigned i; - if (unlikely(trans->restarted)) - return -((int) trans->restarted); + trans_for_each_path(trans, path, i) + __bch2_btree_path_unlock(trans, path); +} - trans_for_each_path(trans, path, i) { - struct get_locks_fail f; +static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, + struct get_locks_fail *f, bool trace) +{ + if (!trace) + goto out; - if (path->should_be_locked && - !btree_path_get_locks(trans, path, false, &f)) { - if (trace_trans_restart_relock_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bpos_to_text(&buf, path->pos); - prt_printf(&buf, " l=%u seq=%u node seq=", - f.l, path->l[f.l].lock_seq); - if (IS_ERR_OR_NULL(f.b)) { - prt_str(&buf, bch2_err_str(PTR_ERR(f.b))); - } else { - prt_printf(&buf, "%u", f.b->c.lock.seq); - - struct six_lock_count c = - bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l); - prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); - - c = six_lock_counts(&f.b->c.lock); - prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); - } + if 
(trace_trans_restart_relock_enabled()) { + struct printbuf buf = PRINTBUF; - trace_trans_restart_relock(trans, _RET_IP_, buf.buf); - printbuf_exit(&buf); - } + bch2_bpos_to_text(&buf, path->pos); + prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq); + if (IS_ERR_OR_NULL(f->b)) { + prt_str(&buf, bch2_err_str(PTR_ERR(f->b))); + } else { + prt_printf(&buf, "%u", f->b->c.lock.seq); - count_event(trans->c, trans_restart_relock); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + struct six_lock_count c = + bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l); + prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); + + c = six_lock_counts(&f->b->c.lock); + prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); } + + trace_trans_restart_relock(trans, _RET_IP_, buf.buf); + printbuf_exit(&buf); } - return 0; + count_event(trans->c, trans_restart_relock); +out: + __bch2_trans_unlock(trans); + bch2_trans_verify_locks(trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } -int bch2_trans_relock_notrace(struct btree_trans *trans) +static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) { - struct btree_path *path; - unsigned i; + bch2_trans_verify_locks(trans); if (unlikely(trans->restarted)) return -((int) trans->restarted); + if (unlikely(trans->locked)) + goto out; + + struct btree_path *path; + unsigned i; + + trans_for_each_path(trans, path, i) { + struct get_locks_fail f; - trans_for_each_path(trans, path, i) if (path->should_be_locked && - !bch2_btree_path_relock_norestart(trans, path)) { - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); - } + !btree_path_get_locks(trans, path, false, &f)) + return bch2_trans_relock_fail(trans, path, &f, trace); + } + + trans->locked = true; +out: + bch2_trans_verify_locks(trans); return 0; } +int bch2_trans_relock(struct btree_trans *trans) +{ + return __bch2_trans_relock(trans, true); +} + 
+int bch2_trans_relock_notrace(struct btree_trans *trans) +{ + return __bch2_trans_relock(trans, false); +} + void bch2_trans_unlock_noassert(struct btree_trans *trans) { - struct btree_path *path; - unsigned i; + __bch2_trans_unlock(trans); - trans_for_each_path(trans, path, i) - __bch2_btree_path_unlock(trans, path); + trans->locked = false; + trans->last_unlock_ip = _RET_IP_; } void bch2_trans_unlock(struct btree_trans *trans) { - struct btree_path *path; - unsigned i; + __bch2_trans_unlock(trans); - trans_for_each_path(trans, path, i) - __bch2_btree_path_unlock(trans, path); + trans->locked = false; + trans->last_unlock_ip = _RET_IP_; } void bch2_trans_unlock_long(struct btree_trans *trans) @@ -809,17 +829,6 @@ void bch2_trans_unlock_long(struct btree_trans *trans) bch2_trans_srcu_unlock(trans); } -bool bch2_trans_locked(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - if (path->nodes_locked) - return true; - return false; -} - int __bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock) { @@ -836,15 +845,19 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans, void bch2_btree_path_verify_locks(struct btree_path *path) { - unsigned l; + /* + * A path may be uptodate and yet have nothing locked if and only if + * there is no node at path->level, which generally means we were + * iterating over all nodes and got to the end of the btree + */ + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && + btree_path_node(path, path->level) && + !path->nodes_locked); - if (!path->nodes_locked) { - BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && - btree_path_node(path, path->level)); + if (!path->nodes_locked) return; - } - for (l = 0; l < BTREE_MAX_DEPTH; l++) { + for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { int want = btree_lock_want(path, l); int have = btree_node_locked_type(path, l); @@ -857,8 +870,24 @@ void bch2_btree_path_verify_locks(struct btree_path *path) } } +static bool 
bch2_trans_locked(struct btree_trans *trans) +{ + struct btree_path *path; + unsigned i; + + trans_for_each_path(trans, path, i) + if (path->nodes_locked) + return true; + return false; +} + void bch2_trans_verify_locks(struct btree_trans *trans) { + if (!trans->locked) { + BUG_ON(bch2_trans_locked(trans)); + return; + } + struct btree_path *path; unsigned i; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 4bd72c855da1..7f41545b9147 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -364,14 +364,14 @@ static inline int bch2_btree_path_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) { - struct get_locks_fail f; + struct get_locks_fail f = {}; unsigned old_locks_want = path->locks_want; new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); if (path->locks_want < new_locks_want ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f) - : path->uptodate == BTREE_ITER_UPTODATE) + : path->nodes_locked) return 0; trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index bbec91e8e650..74e1ff225674 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_foreground.h" #include "btree_gc.h" #include "btree_io.h" #include "btree_iter.h" @@ -19,6 +20,26 @@ #include <linux/prefetch.h> +static const char * const trans_commit_flags_strs[] = { +#define x(n, ...) 
#n, + BCH_TRANS_COMMIT_FLAGS() +#undef x + NULL +}; + +void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags) +{ + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; + + prt_printf(out, "watermark=%s", bch2_watermarks[watermark]); + + flags >>= BCH_WATERMARK_BITS; + if (flags) { + prt_char(out, ' '); + bch2_prt_bitflags(out, trans_commit_flags_strs, flags); + } +} + static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) { #ifdef CONFIG_BCACHEFS_DEBUG @@ -315,8 +336,8 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(i->btree_id != path->btree_id); EBUG_ON(!i->level && btree_type_has_snapshots(i->btree_id) && - !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && - test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && + !(i->flags & BTREE_UPDATE_internal_snapshot_node) && + test_bit(JOURNAL_replay_done, &trans->c->journal.flags) && i->k->k.p.snapshot && bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0); } @@ -443,13 +464,13 @@ static int run_one_mem_trigger(struct btree_trans *trans, verify_update_old_key(trans, i); - if (unlikely(flags & BTREE_TRIGGER_NORUN)) + if (unlikely(flags & BTREE_TRIGGER_norun)) return 0; if (old_ops->trigger == new_ops->trigger) { ret = bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(new), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags); } else { ret = bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(new), flags) ?: @@ -472,11 +493,11 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ struct bkey_s_c old = { &old_k, i->old_v }; const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - unsigned flags = i->flags|BTREE_TRIGGER_TRANSACTIONAL; + unsigned flags = i->flags|BTREE_TRIGGER_transactional; 
verify_update_old_key(trans, i); - if ((i->flags & BTREE_TRIGGER_NORUN) || + if ((i->flags & BTREE_TRIGGER_norun) || !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) return 0; @@ -486,8 +507,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ i->overwrite_trigger_run = true; i->insert_trigger_run = true; return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k), - BTREE_TRIGGER_INSERT| - BTREE_TRIGGER_OVERWRITE|flags) ?: 1; + BTREE_TRIGGER_insert| + BTREE_TRIGGER_overwrite|flags) ?: 1; } else if (overwrite && !i->overwrite_trigger_run) { i->overwrite_trigger_run = true; return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1; @@ -572,7 +593,7 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && + BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && (!i->insert_trigger_run || !i->overwrite_trigger_run)); #endif @@ -590,7 +611,7 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) && gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) { - int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); + int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc); if (ret) return ret; } @@ -609,6 +630,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, unsigned u64s = 0; int ret; + bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_in_restart(trans); + if (race_fault()) { trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); @@ -686,7 +710,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, trans_for_each_update(trans, i) if 
(BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) { - ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags); + ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags); if (ret) goto fatal_err; } @@ -705,7 +729,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (i->key_cache_already_flushed) continue; - if (i->flags & BTREE_UPDATE_NOJOURNAL) + if (i->flags & BTREE_UPDATE_nojournal) continue; verify_update_old_key(trans, i); @@ -766,16 +790,15 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans } static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct btree_insert_entry *i, struct printbuf *err) { struct bch_fs *c = trans->c; printbuf_reset(err); - prt_printf(err, "invalid bkey on insert from %s -> %ps", + prt_printf(err, "invalid bkey on insert from %s -> %ps\n", trans->fn, (void *) i->ip_allocated); - prt_newline(err); printbuf_indent_add(err, 2); bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); @@ -796,8 +819,7 @@ static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans * struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - prt_printf(&buf, "invalid bkey on insert from %s", trans->fn); - prt_newline(&buf); + prt_printf(&buf, "invalid bkey on insert from %s\n", trans->fn); printbuf_indent_add(&buf, 2); bch2_journal_entry_to_text(&buf, c, i); @@ -988,6 +1010,9 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) struct bch_fs *c = trans->c; int ret = 0; + bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_in_restart(trans); + if (!trans->nr_updates && !trans->journal_entries_u64s) goto out_reset; @@ -1000,10 +1025,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) trans_for_each_update(trans, i) { struct printbuf buf = PRINTBUF; - enum bkey_invalid_flags invalid_flags = 0; + enum 
bch_validate_flags invalid_flags = 0; if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, invalid_flags, &buf))) @@ -1018,10 +1043,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) for (struct jset_entry *i = trans->journal_entries; i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); i = vstruct_next(i)) { - enum bkey_invalid_flags invalid_flags = 0; + enum bch_validate_flags invalid_flags = 0; if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; if (unlikely(bch2_journal_entry_validate(c, NULL, i, bcachefs_metadata_version_current, @@ -1065,7 +1090,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (i->key_cache_already_flushed) continue; - if (i->flags & BTREE_UPDATE_NOJOURNAL) + if (i->flags & BTREE_UPDATE_nojournal) continue; /* we're going to journal the key being updated: */ @@ -1086,6 +1111,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) } retry: errored_at = NULL; + bch2_trans_verify_not_unlocked(trans); bch2_trans_verify_not_in_restart(trans); if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) memset(&trans->journal_res, 0, sizeof(trans->journal_res)); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index c69b233c41bb..d63db4fefe73 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -163,9 +163,21 @@ struct btree_cache { /* Number of elements in live + freeable lists */ unsigned used; unsigned reserve; + unsigned freed; + unsigned not_freed_lock_intent; + unsigned not_freed_lock_write; + unsigned not_freed_dirty; + unsigned not_freed_read_in_flight; + unsigned not_freed_write_in_flight; + unsigned not_freed_noevict; + 
unsigned not_freed_write_blocked; + unsigned not_freed_will_make_reachable; + unsigned not_freed_access_bit; atomic_t dirty; struct shrinker *shrink; + unsigned used_by_btree[BTREE_ID_NR]; + /* * If we need to allocate memory for a new btree node and that * allocation fails, we can cannibalize another node in the btree cache @@ -187,36 +199,89 @@ struct btree_node_iter { } data[MAX_BSETS]; }; +#define BTREE_ITER_FLAGS() \ + x(slots) \ + x(intent) \ + x(prefetch) \ + x(is_extents) \ + x(not_extents) \ + x(cached) \ + x(with_key_cache) \ + x(with_updates) \ + x(with_journal) \ + x(snapshot_field) \ + x(all_snapshots) \ + x(filter_snapshots) \ + x(nopreserve) \ + x(cached_nofill) \ + x(key_cache_fill) \ + +#define STR_HASH_FLAGS() \ + x(must_create) \ + x(must_replace) + +#define BTREE_UPDATE_FLAGS() \ + x(internal_snapshot_node) \ + x(nojournal) \ + x(key_cache_reclaim) + + /* - * Iterate over all possible positions, synthesizing deleted keys for holes: - */ -static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0; -/* - * Indicates that intent locks should be taken on leaf nodes, because we expect - * to be doing updates: - */ -static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 1; -/* - * Causes the btree iterator code to prefetch additional btree nodes from disk: - */ -static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 2; -/* - * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for - * @pos or the first key strictly greater than @pos + * BTREE_TRIGGER_norun - don't run triggers at all + * + * BTREE_TRIGGER_transactional - we're running transactional triggers as part of + * a transaction commit: triggers may generate new updates + * + * BTREE_TRIGGER_atomic - we're running atomic triggers during a transaction + * commit: we have our journal reservation, we're holding btree node write + * locks, and we know the transaction is going to commit (returning an error + * here is a fatal error, causing us to go emergency read-only) + * 
+ * BTREE_TRIGGER_gc - we're in gc/fsck: running triggers to recalculate e.g. disk usage + * + * BTREE_TRIGGER_insert - @new is entering the btree + * BTREE_TRIGGER_overwrite - @old is leaving the btree + * + * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc + * trigger */ -static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 3; -static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 4; -static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 5; -static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 6; -static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 7; -static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 8; -static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 9; -static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; -static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 11; -static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 12; -static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 13; -static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 14; -#define __BTREE_ITER_FLAGS_END 15 +#define BTREE_TRIGGER_FLAGS() \ + x(norun) \ + x(transactional) \ + x(atomic) \ + x(check_repair) \ + x(gc) \ + x(insert) \ + x(overwrite) \ + x(is_root) \ + x(bucket_invalidate) + +enum { +#define x(n) BTREE_ITER_FLAG_BIT_##n, + BTREE_ITER_FLAGS() + STR_HASH_FLAGS() + BTREE_UPDATE_FLAGS() + BTREE_TRIGGER_FLAGS() +#undef x +}; + +/* iter flags must fit in a u16: */ +//BUILD_BUG_ON(BTREE_ITER_FLAG_BIT_key_cache_fill > 15); + +enum btree_iter_update_trigger_flags { +#define x(n) BTREE_ITER_##n = 1U << BTREE_ITER_FLAG_BIT_##n, + BTREE_ITER_FLAGS() +#undef x +#define x(n) STR_HASH_##n = 1U << BTREE_ITER_FLAG_BIT_##n, + STR_HASH_FLAGS() +#undef x +#define x(n) BTREE_UPDATE_##n = 1U << BTREE_ITER_FLAG_BIT_##n, + BTREE_UPDATE_FLAGS() +#undef x +#define x(n) BTREE_TRIGGER_##n = 1U << BTREE_ITER_FLAG_BIT_##n, + BTREE_TRIGGER_FLAGS() +#undef x 
+}; enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, @@ -307,7 +372,7 @@ struct btree_iter { */ struct bkey k; - /* BTREE_ITER_WITH_JOURNAL: */ + /* BTREE_ITER_with_journal: */ size_t journal_idx; #ifdef TRACK_PATH_ALLOCATED unsigned long ip_allocated; @@ -418,6 +483,8 @@ struct btree_trans { u8 lock_must_abort; bool lock_may_not_fail:1; bool srcu_held:1; + bool locked:1; + bool write_locked:1; bool used_mempool:1; bool in_traverse_all:1; bool paths_sorted:1; @@ -425,13 +492,13 @@ struct btree_trans { bool journal_transaction_names:1; bool journal_replay_not_finished:1; bool notrace_relock_fail:1; - bool write_locked:1; enum bch_errcode restarted:16; u32 restart_count; u64 last_begin_time; unsigned long last_begin_ip; unsigned long last_restarted_ip; + unsigned long last_unlock_ip; unsigned long srcu_lock_time; const char *fn; diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 8e47e260eba5..f3c645a43dcb 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -25,14 +25,14 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, static int __must_check bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t, - struct bkey_i *, enum btree_update_flags, + struct bkey_i *, enum btree_iter_update_trigger_flags, unsigned long ip); static noinline int extent_front_merge(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, struct bkey_i **insert, - enum btree_update_flags flags) + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct bkey_i *update; @@ -104,8 +104,8 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, pos.snapshot++; for_each_btree_key_norestart(trans, iter, btree_id, pos, - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_NOPRESERVE, k, ret) { + BTREE_ITER_all_snapshots| + BTREE_ITER_nopreserve, k, ret) { if (!bkey_eq(k.k->p, pos)) break; @@ -138,8 +138,8 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, 
darray_init(&s); bch2_trans_iter_init(trans, &old_iter, id, old_pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); while ((old_k = bch2_btree_iter_prev(&old_iter)).k && !(ret = bkey_err(old_k)) && bkey_eq(old_pos, old_k.k->p)) { @@ -151,8 +151,8 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, continue; new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); + BTREE_ITER_not_extents| + BTREE_ITER_intent); ret = bkey_err(new_k); if (ret) break; @@ -168,7 +168,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, update->k.type = KEY_TYPE_whiteout; ret = bch2_trans_update(trans, &new_iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); } bch2_trans_iter_exit(trans, &new_iter); @@ -185,7 +185,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, int bch2_trans_update_extent_overwrite(struct btree_trans *trans, struct btree_iter *iter, - enum btree_update_flags flags, + enum btree_iter_update_trigger_flags flags, struct bkey_s_c old, struct bkey_s_c new) { @@ -218,7 +218,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, ret = bch2_insert_snapshot_whiteouts(trans, btree_id, old.k->p, update->k.p) ?: bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + BTREE_UPDATE_internal_snapshot_node|flags); if (ret) return ret; } @@ -235,7 +235,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, ret = bch2_insert_snapshot_whiteouts(trans, btree_id, old.k->p, update->k.p) ?: bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + BTREE_UPDATE_internal_snapshot_node|flags); if (ret) return ret; } @@ -260,7 +260,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, } ret = bch2_btree_insert_nonextent(trans, btree_id, update, - 
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + BTREE_UPDATE_internal_snapshot_node|flags); if (ret) return ret; } @@ -273,7 +273,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, bch2_cut_front(new.k->p, update); ret = bch2_trans_update_by_path(trans, iter->path, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_UPDATE_internal_snapshot_node| flags, _RET_IP_); if (ret) return ret; @@ -285,7 +285,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, static int bch2_trans_update_extent(struct btree_trans *trans, struct btree_iter *orig_iter, struct bkey_i *insert, - enum btree_update_flags flags) + enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; struct bkey_s_c k; @@ -293,9 +293,9 @@ static int bch2_trans_update_extent(struct btree_trans *trans, int ret = 0; bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_NOT_EXTENTS); + BTREE_ITER_intent| + BTREE_ITER_with_updates| + BTREE_ITER_not_extents); k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; @@ -346,7 +346,7 @@ err: static noinline int flush_new_cached_update(struct btree_trans *trans, struct btree_insert_entry *i, - enum btree_update_flags flags, + enum btree_iter_update_trigger_flags flags, unsigned long ip) { struct bkey k; @@ -354,7 +354,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans, btree_path_idx_t path_idx = bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0, - BTREE_ITER_INTENT, _THIS_IP_); + BTREE_ITER_intent, _THIS_IP_); ret = bch2_btree_path_traverse(trans, path_idx, 0); if (ret) goto out; @@ -372,7 +372,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans, goto out; i->key_cache_already_flushed = true; - i->flags |= BTREE_TRIGGER_NORUN; + i->flags |= BTREE_TRIGGER_norun; btree_path_set_should_be_locked(btree_path); ret = bch2_trans_update_by_path(trans, 
path_idx, i->k, flags, ip); @@ -383,7 +383,7 @@ out: static int __must_check bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, - struct bkey_i *k, enum btree_update_flags flags, + struct bkey_i *k, enum btree_iter_update_trigger_flags flags, unsigned long ip) { struct bch_fs *c = trans->c; @@ -479,15 +479,15 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, if (!iter->key_cache_path) iter->key_cache_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT| - BTREE_ITER_CACHED, _THIS_IP_); + BTREE_ITER_intent| + BTREE_ITER_cached, _THIS_IP_); iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_CACHED); + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_cached); if (unlikely(ret)) return ret; @@ -505,17 +505,17 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, } int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_update_flags flags) + struct bkey_i *k, enum btree_iter_update_trigger_flags flags) { btree_path_idx_t path_idx = iter->update_path ?: iter->path; int ret; - if (iter->flags & BTREE_ITER_IS_EXTENTS) + if (iter->flags & BTREE_ITER_is_extents) return bch2_trans_update_extent(trans, iter, k, flags); if (bkey_deleted(&k->k) && - !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && - (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { + !(flags & BTREE_UPDATE_key_cache_reclaim) && + (iter->flags & BTREE_ITER_filter_snapshots)) { ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); if (unlikely(ret < 0)) return ret; @@ -528,7 +528,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter * Ensure that updates to cached btrees go to the key cache: */ 
struct btree_path *path = trans->paths + path_idx; - if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + if (!(flags & BTREE_UPDATE_key_cache_reclaim) && !path->cached && !path->level && btree_id_cached(trans->c, path->btree_id)) { @@ -587,7 +587,7 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k; int ret = 0; - bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent); k = bch2_btree_iter_prev(iter); ret = bkey_err(k); if (ret) @@ -621,15 +621,15 @@ void bch2_trans_commit_hook(struct btree_trans *trans, int bch2_btree_insert_nonextent(struct btree_trans *trans, enum btree_id btree, struct bkey_i *k, - enum btree_update_flags flags) + enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; int ret; bch2_trans_iter_init(trans, &iter, btree, k->k.p, - BTREE_ITER_CACHED| - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); + BTREE_ITER_cached| + BTREE_ITER_not_extents| + BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, k, flags); bch2_trans_iter_exit(trans, &iter); @@ -637,16 +637,13 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans, } int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, - struct bkey_i *k, enum btree_update_flags flags) + struct bkey_i *k, enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; - int ret; - bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, flags); + BTREE_ITER_intent|flags); + int ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, flags); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -698,8 +695,8 @@ int bch2_btree_delete(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, btree, pos, - BTREE_ITER_CACHED| - 
BTREE_ITER_INTENT); + BTREE_ITER_cached| + BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(trans, &iter, update_flags); bch2_trans_iter_exit(trans, &iter); @@ -717,7 +714,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, struct bkey_s_c k; int ret = 0; - bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent); while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { struct disk_reservation disk_res = bch2_disk_reservation_init(trans->c, 0); @@ -745,7 +742,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, */ delete.k.p = iter.pos; - if (iter.flags & BTREE_ITER_IS_EXTENTS) + if (iter.flags & BTREE_ITER_is_extents) bch2_key_resize(&delete.k, bpos_min(end, k.k->p).offset - iter.pos.offset); @@ -804,7 +801,7 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, k->k.p = pos; struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, k, 0); @@ -852,7 +849,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, if (ret) goto err; - if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { + if (!test_bit(JOURNAL_running, &c->journal.flags)) { ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s)); if (ret) goto err; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index cc7c53e83f89..b4894e4d5447 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -44,16 +44,18 @@ enum bch_trans_commit_flags { #undef x }; +void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags); + int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, unsigned, unsigned); int bch2_btree_delete_at(struct 
btree_trans *, struct btree_iter *, unsigned); int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, - struct bkey_i *, enum btree_update_flags); + struct bkey_i *, enum btree_iter_update_trigger_flags); int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *, - enum btree_update_flags); + enum btree_iter_update_trigger_flags); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, int flags); @@ -94,14 +96,14 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, } int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, - enum btree_update_flags, + enum btree_iter_update_trigger_flags, struct bkey_s_c, struct bkey_s_c); int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, enum btree_id, struct bpos); int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, enum btree_update_flags); + struct bkey_i *, enum btree_iter_update_trigger_flags); struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned); @@ -276,7 +278,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr unsigned flags, unsigned type, unsigned min_bytes) { struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, - btree_id, pos, flags|BTREE_ITER_INTENT, type); + btree_id, pos, flags|BTREE_ITER_intent, type); struct bkey_i *ret = IS_ERR(k.k) ? 
ERR_CAST(k.k) : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); @@ -299,7 +301,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, unsigned flags, unsigned type, unsigned min_bytes) { struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, - btree_id, pos, flags|BTREE_ITER_INTENT, type, min_bytes); + btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes); int ret; if (IS_ERR(mut)) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index b4efd8cc4d1a..60b8544cea48 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -38,22 +38,6 @@ static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, btree_path_idx_t, struct btree *, struct keylist *); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); -static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, - enum btree_id btree_id, - unsigned level, - struct bpos pos) -{ - btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level, - BTREE_ITER_NOPRESERVE| - BTREE_ITER_INTENT, _RET_IP_); - path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_); - - struct btree_path *path = trans->paths + path_idx; - bch2_btree_path_downgrade(trans, path); - __bch2_btree_path_unlock(trans, path); - return path_idx; -} - /* * Verify that child nodes correctly span parent node's range: */ @@ -73,6 +57,24 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, b->data->min_key)); + if (b == btree_node_root(c, b)) { + if (!bpos_eq(b->data->min_key, POS_MIN)) { + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->min_key); + need_fsck_err(c, btree_root_bad_min_key, + "btree root with incorrect min_key: %s", buf.buf); + goto topology_repair; + } + + if (!bpos_eq(b->data->max_key, SPOS_MAX)) { + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, 
b->data->max_key); + need_fsck_err(c, btree_root_bad_max_key, + "btree root with incorrect max_key: %s", buf.buf); + goto topology_repair; + } + } + if (!b->c.level) return 0; @@ -158,7 +160,6 @@ topology_repair: static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) { struct bkey_packed *k; - struct bset_tree *t; struct bkey uk; for_each_bset(b, t) @@ -646,7 +647,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k), - BTREE_TRIGGER_TRANSACTIONAL); + BTREE_TRIGGER_transactional); if (ret) return ret; } @@ -655,7 +656,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k), - BTREE_TRIGGER_TRANSACTIONAL); + BTREE_TRIGGER_transactional); if (ret) return ret; } @@ -735,9 +736,6 @@ err: */ b = READ_ONCE(as->b); if (b) { - btree_path_idx_t path_idx = get_unlocked_mut_path(trans, - as->btree_id, b->c.level, b->key.k.p); - struct btree_path *path = trans->paths + path_idx; /* * @b is the node we did the final insert into: * @@ -755,12 +753,16 @@ err: * btree_node_lock_nopath() (the use of which is always suspect, * we need to work on removing this in the future) * - * It should be, but get_unlocked_mut_path() -> bch2_path_get() + * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get() * calls bch2_path_upgrade(), before we call path_make_mut(), so * we may rarely end up with a locked path besides the one we * have here: */ bch2_trans_unlock(trans); + bch2_trans_begin(trans); + btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans, + as->btree_id, b->c.level, b->key.k.p); + struct btree_path *path = trans->paths + path_idx; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); mark_btree_node_locked(trans, path, b->c.level, 
BTREE_NODE_INTENT_LOCKED); path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); @@ -1154,13 +1156,12 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags |= watermark; if (watermark < BCH_WATERMARK_reclaim && - test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) { + test_bit(JOURNAL_space_low, &c->journal.flags)) { if (flags & BCH_TRANS_COMMIT_journal_reclaim) return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock); - bch2_trans_unlock(trans); - wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)); - ret = bch2_trans_relock(trans); + ret = drop_locks_do(trans, + ({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; })); if (ret) return ERR_PTR(ret); } @@ -1206,7 +1207,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as->start_time = start_time; as->ip_started = _RET_IP_; as->mode = BTREE_UPDATE_none; - as->watermark = watermark; + as->flags = flags; as->took_gc_lock = true; as->btree_id = path->btree_id; as->update_level_start = level_start; @@ -1360,7 +1361,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && !btree_ptr_sectors_written(insert)); - if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) + if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), @@ -1619,12 +1620,12 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); - path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p); + path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); 
bch2_btree_path_level_init(trans, trans->paths + path1, n1); - path2 = get_unlocked_mut_path(trans, as->btree_id, n2->c.level, n2->key.k.p); + path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p); six_lock_increment(&n2->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + path2, n2); @@ -1669,7 +1670,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_update_add_new_node(as, n1); six_unlock_write(&n1->c.lock); - path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p); + path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + path1, n1); @@ -1947,6 +1948,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, u64 start_time = local_clock(); int ret = 0; + bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked(trans); BUG_ON(!trans->paths[path].should_be_locked); BUG_ON(!btree_node_locked(&trans->paths[path], level)); @@ -1979,7 +1982,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, : bpos_successor(b->data->max_key); sib_path = bch2_path_get(trans, btree, sib_pos, - U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); + U8_MAX, level, BTREE_ITER_intent, _THIS_IP_); ret = bch2_btree_path_traverse(trans, sib_path, false); if (ret) goto err; @@ -2072,7 +2075,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_add_new_node(as, n); six_unlock_write(&n->c.lock); - new_path = get_unlocked_mut_path(trans, btree, n->c.level, n->key.k.p); + new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); 
mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + new_path, n); @@ -2150,7 +2153,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, bch2_btree_update_add_new_node(as, n); six_unlock_write(&n->c.lock); - new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); + new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + new_path, n); @@ -2333,10 +2336,10 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, if (!skip_triggers) { ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1, bkey_i_to_s_c(&b->key), - BTREE_TRIGGER_TRANSACTIONAL) ?: + BTREE_TRIGGER_transactional) ?: bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1, bkey_i_to_s(new_key), - BTREE_TRIGGER_TRANSACTIONAL); + BTREE_TRIGGER_transactional); if (ret) return ret; } @@ -2353,7 +2356,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bch2_trans_copy_iter(&iter2, iter); iter2.path = bch2_btree_path_make_mut(trans, iter2.path, - iter2.flags & BTREE_ITER_INTENT, + iter2.flags & BTREE_ITER_intent, _THIS_IP_); struct btree_path *path2 = btree_iter_path(trans, &iter2); @@ -2365,7 +2368,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, trans->paths_sorted = false; ret = bch2_btree_iter_traverse(&iter2) ?: - bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); + bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun); if (ret) goto err; } else { @@ -2473,7 +2476,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, BTREE_MAX_DEPTH, b->c.level, - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = 
bch2_btree_iter_traverse(&iter); if (ret) goto out; @@ -2487,7 +2490,6 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, BUG_ON(!btree_node_hashed(b)); - struct bch_extent_ptr *ptr; bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr, !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev)); @@ -2511,7 +2513,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) bch2_btree_set_root_inmem(c, b); } -static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level) +int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level) { struct bch_fs *c = trans->c; struct closure cl; @@ -2559,17 +2561,18 @@ static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) { - bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level)); + bch2_trans_run(c, bch2_btree_root_alloc_fake_trans(trans, id, level)); } static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) { - prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", - (void *) as->ip_started, + prt_printf(out, "%ps: ", (void *) as->ip_started); + bch2_trans_commit_flags_to_text(out, as->flags); + + prt_printf(out, " btree=%s l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", bch2_btree_id_str(as->btree_id), as->update_level_start, as->update_level_end, - bch2_watermarks[as->watermark], bch2_btree_update_modes[as->mode], as->nodes_written, closure_nr_remaining(&as->cl), diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index c1a479ebaad1..b5b76ce01cfc 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -52,7 +52,7 @@ struct btree_update { struct list_head unwritten_list; enum btree_update_mode mode; - enum bch_watermark watermark; + 
enum bch_trans_commit_flags flags; unsigned nodes_written:1; unsigned took_gc_lock:1; @@ -144,6 +144,9 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, EBUG_ON(!btree_node_locked(path, level)); + if (bch2_btree_node_merging_disabled) + return 0; + b = path->l[level].b; if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) return 0; @@ -172,6 +175,8 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, struct bkey_i *, unsigned, bool); void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); + +int bch2_btree_root_alloc_fake_trans(struct btree_trans *, enum btree_id, unsigned); void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned); static inline unsigned btree_update_reserve_required(struct bch_fs *c, diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 36a6f42aba5e..75c8a196b3f6 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -122,7 +122,7 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans, trans->journal_res.seq = wb->journal_seq; return bch2_trans_update(trans, iter, &wb->k, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_check_rw| @@ -191,13 +191,13 @@ btree_write_buffered_insert(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), - BTREE_ITER_CACHED|BTREE_ITER_INTENT); + BTREE_ITER_cached|BTREE_ITER_intent); trans->journal_res.seq = wb->journal_seq; ret = bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, &wb->k, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -332,7 +332,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) if (!iter.path || iter.btree_id != 
k->btree) { bch2_trans_iter_exit(trans, &iter); bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p, - BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_intent|BTREE_ITER_all_snapshots); } bch2_btree_iter_set_pos(&iter, k->k.k.p); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 82f179258867..e28d28ac2a13 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -274,25 +274,14 @@ void bch2_dev_usage_init(struct bch_dev *ca) void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) { - prt_tab(out); - prt_str(out, "buckets"); - prt_tab_rjust(out); - prt_str(out, "sectors"); - prt_tab_rjust(out); - prt_str(out, "fragmented"); - prt_tab_rjust(out); - prt_newline(out); + prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n"); for (unsigned i = 0; i < BCH_DATA_NR; i++) { bch2_prt_data_type(out, i); - prt_tab(out); - prt_u64(out, usage->d[i].buckets); - prt_tab_rjust(out); - prt_u64(out, usage->d[i].sectors); - prt_tab_rjust(out); - prt_u64(out, usage->d[i].fragmented); - prt_tab_rjust(out); - prt_newline(out); + prt_printf(out, "\t%llu\r%llu\r%llu\r\n", + usage->d[i].buckets, + usage->d[i].sectors, + usage->d[i].fragmented); } } @@ -329,26 +318,6 @@ void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, preempt_enable(); } -static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) -{ - return (struct bch_alloc_v4) { - .gen = b.gen, - .data_type = b.data_type, - .dirty_sectors = b.dirty_sectors, - .cached_sectors = b.cached_sectors, - .stripe = b.stripe, - }; -} - -void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, - struct bucket *old, struct bucket *new) -{ - struct bch_alloc_v4 old_a = bucket_m_to_alloc(*old); - struct bch_alloc_v4 new_a = bucket_m_to_alloc(*new); - - bch2_dev_usage_update(c, ca, &old_a, &new_a, 0, true); -} - static inline int __update_replicas(struct bch_fs *c, struct bch_fs_usage *fs_usage, struct bch_replicas_entry_v1 *r, @@ -496,78 +465,276 @@ int 
bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 return bch2_update_replicas_list(trans, &r.e, sectors); } -int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, enum bch_data_type data_type, - unsigned sectors, struct gc_pos pos, - unsigned flags) +int bch2_check_fix_ptrs(struct btree_trans *trans, + enum btree_id btree, unsigned level, struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags) { - struct bucket old, new, *g; + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry_c; + struct extent_ptr_decoded p = { 0 }; + bool do_update = false; + struct printbuf buf = PRINTBUF; int ret = 0; - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - BUG_ON(data_type != BCH_DATA_sb && - data_type != BCH_DATA_journal); + percpu_down_read(&c->mark_lock); - /* - * Backup superblock might be past the end of our normal usable space: - */ - if (b >= ca->mi.nbuckets) - return 0; + rcu_read_lock(); + bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + if (!ca) { + if (fsck_err(c, ptr_to_invalid_device, + "pointer to missing device %u\n" + "while marking %s", + p.ptr.dev, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + do_update = true; + continue; + } - percpu_down_read(&c->mark_lock); - g = gc_bucket(ca, b); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry_c); + + if (fsck_err_on(!g->gen_valid, + c, ptr_to_missing_alloc_key, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (!p.ptr.cached) { + g->gen_valid = true; + g->gen = p.ptr.gen; + } else { + do_update = true; + } + } - bucket_lock(g); - old = *g; 
+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (!p.ptr.cached && + (g->data_type != BCH_DATA_btree || + data_type == BCH_DATA_btree)) { + g->gen_valid = true; + g->gen = p.ptr.gen; + g->data_type = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; + } else { + do_update = true; + } + } - if (bch2_fs_inconsistent_on(g->data_type && - g->data_type != data_type, c, - "different types of data in same bucket: %s, %s", - bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type))) { - ret = -EIO; - goto err; - } + if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + do_update = true; + + if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, + c, stale_dirty_ptr, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + do_update = true; + + if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) + continue; + + if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), + c, ptr_bucket_data_type_mismatch, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type), + 
(printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (data_type == BCH_DATA_btree) { + g->gen_valid = true; + g->gen = p.ptr.gen; + g->data_type = data_type; + g->dirty_sectors = 0; + g->cached_sectors = 0; + } else { + do_update = true; + } + } - if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, - "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size", - ca->dev_idx, b, g->gen, - bch2_data_type_str(g->data_type ?: data_type), - g->dirty_sectors, sectors)) { - ret = -EIO; - goto err; + if (p.has_ec) { + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); + + if (fsck_err_on(!m || !m->alive, c, + ptr_to_missing_stripe, + "pointer to nonexistent stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + do_update = true; + + if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, + ptr_to_incorrect_stripe, + "pointer does not match stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + do_update = true; + } } + rcu_read_unlock(); - g->data_type = data_type; - g->dirty_sectors += sectors; - new = *g; + if (do_update) { + if (flags & BTREE_TRIGGER_is_root) { + bch_err(c, "cannot update btree roots yet"); + ret = -EINVAL; + goto err; + } + + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; + + rcu_read_lock(); + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_rcu(c, ptr->dev)); + rcu_read_unlock(); + + if (level) { + /* + * We don't want to drop btree node pointers - if the + * btree node isn't there anymore, the read path will + * sort it out: + */ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + rcu_read_lock(); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + struct bucket *g = PTR_GC_BUCKET(ca, 
ptr); + + ptr->gen = g->gen; + } + rcu_read_unlock(); + } else { + struct bkey_ptrs ptrs; + union bch_extent_entry *entry; +restart_drop_ptrs: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + rcu_read_lock(); + bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) { + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry); + + if ((p.ptr.cached && + (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) || + (!p.ptr.cached && + gen_cmp(p.ptr.gen, g->gen) < 0) || + gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX || + (g->data_type && + g->data_type != data_type)) { + bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr); + goto restart_drop_ptrs; + } + } + rcu_read_unlock(); +again: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_extent_entry_for_each(ptrs, entry) { + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, + entry->stripe_ptr.idx); + union bch_extent_entry *next_ptr; + + bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) + if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) + goto found; + next_ptr = NULL; +found: + if (!next_ptr) { + bch_err(c, "aieee, found stripe ptr with no data ptr"); + continue; + } + + if (!m || !m->alive || + !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], + &next_ptr->ptr, + m->sectors)) { + bch2_bkey_extent_entry_drop(new, entry); + goto again; + } + } + } + } + + if (0) { + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, k); + bch_info(c, "updated %s", buf.buf); + + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); + bch_info(c, "new key %s", buf.buf); + } + + percpu_up_read(&c->mark_lock); + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, + BTREE_ITER_intent|BTREE_ITER_all_snapshots); + ret = bch2_btree_iter_traverse(&iter) ?: + 
bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_internal_snapshot_node| + BTREE_TRIGGER_norun); + bch2_trans_iter_exit(trans, &iter); + percpu_down_read(&c->mark_lock); + + if (ret) + goto err; + + if (level) + bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); + } err: - bucket_unlock(g); - if (!ret) - bch2_dev_usage_update_m(c, ca, &old, &new); +fsck_err: percpu_up_read(&c->mark_lock); + printbuf_exit(&buf); return ret; } -int bch2_check_bucket_ref(struct btree_trans *trans, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - s64 sectors, enum bch_data_type ptr_data_type, - u8 b_gen, u8 bucket_data_type, - u32 bucket_sectors) +int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 b_gen, u8 bucket_data_type, + u32 *bucket_sectors) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); struct printbuf buf = PRINTBUF; + bool inserting = sectors > 0; int ret = 0; - if (bucket_data_type == BCH_DATA_cached) - bucket_data_type = BCH_DATA_user; - - if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) || - (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe)) - bucket_data_type = ptr_data_type = BCH_DATA_stripe; + BUG_ON(!sectors); if (gen_after(ptr->gen, b_gen)) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, @@ -578,8 +745,9 @@ int bch2_check_bucket_ref(struct btree_trans *trans, bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; + if (inserting) + goto err; + goto out; } if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { @@ -592,11 +760,17 @@ int bch2_check_bucket_ref(struct btree_trans *trans, ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; + if 
(inserting) + goto err; + goto out; + } + + if (b_gen != ptr->gen && ptr->cached) { + ret = 1; + goto out; } - if (b_gen != ptr->gen && !ptr->cached) { + if (b_gen != ptr->gen) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, BCH_FSCK_ERR_stale_dirty_ptr, "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" @@ -607,18 +781,12 @@ int bch2_check_bucket_ref(struct btree_trans *trans, ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; - } - - if (b_gen != ptr->gen) { - ret = 1; + if (inserting) + goto err; goto out; } - if (!data_type_is_empty(bucket_data_type) && - ptr_data_type && - bucket_data_type != ptr_data_type) { + if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, BCH_FSCK_ERR_ptr_bucket_data_type_mismatch, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" @@ -628,28 +796,33 @@ int bch2_check_bucket_ref(struct btree_trans *trans, bch2_data_type_str(ptr_data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; + if (inserting) + goto err; + goto out; } - if ((u64) bucket_sectors + sectors > U32_MAX) { + if ((u64) *bucket_sectors + sectors > U32_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, BCH_FSCK_ERR_bucket_sector_count_overflow, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, bch2_data_type_str(bucket_data_type ?: ptr_data_type), - bucket_sectors, sectors, + *bucket_sectors, sectors, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; + if (inserting) + goto err; + sectors = -*bucket_sectors; } + + *bucket_sectors += sectors; out: printbuf_exit(&buf); return ret; err: bch2_dump_trans_updates(trans); + ret = -EIO; goto out; } @@ -786,29 +959,22 @@ need_mark: /* KEY_TYPE_extent: */ -static int __mark_pointer(struct 
btree_trans *trans, +static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, - u8 bucket_gen, u8 *bucket_data_type, - u32 *dirty_sectors, u32 *cached_sectors) + struct bch_alloc_v4 *a) { u32 *dst_sectors = !ptr->cached - ? dirty_sectors - : cached_sectors; - int ret = bch2_check_bucket_ref(trans, k, ptr, sectors, ptr_data_type, - bucket_gen, *bucket_data_type, *dst_sectors); + ? &a->dirty_sectors + : &a->cached_sectors; + int ret = bch2_bucket_ref_update(trans, ca, k, ptr, sectors, ptr_data_type, + a->gen, a->data_type, dst_sectors); if (ret) return ret; - *dst_sectors += sectors; - - if (!*dirty_sectors && !*cached_sectors) - *bucket_data_type = 0; - else if (*bucket_data_type != BCH_DATA_stripe) - *bucket_data_type = ptr_data_type; - + alloc_data_type_set(a, ptr_data_type); return 0; } @@ -816,81 +982,69 @@ static int bch2_trigger_pointer(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, - s64 *sectors, unsigned flags) + s64 *sectors, + enum btree_iter_update_trigger_flags flags) { - bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); + bool insert = !(flags & BTREE_TRIGGER_overwrite); + int ret = 0; + + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); + if (unlikely(!ca)) { + if (insert) + ret = -EIO; + goto err; + } + struct bpos bucket; struct bch_backpointer bp; - - bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, entry, &bucket, &bp); + bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp); *sectors = insert ? 
bp.bucket_len : -((s64) bp.bucket_len); - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { - struct btree_iter iter; - struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, &iter, bucket); - int ret = PTR_ERR_OR_ZERO(a); + if (flags & BTREE_TRIGGER_transactional) { + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket); + ret = PTR_ERR_OR_ZERO(a) ?: + __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &a->v); if (ret) - return ret; - - ret = __mark_pointer(trans, k, &p.ptr, *sectors, bp.data_type, - a->v.gen, &a->v.data_type, - &a->v.dirty_sectors, &a->v.cached_sectors) ?: - bch2_trans_update(trans, &iter, &a->k_i, 0); - bch2_trans_iter_exit(trans, &iter); - - if (ret) - return ret; + goto err; if (!p.ptr.cached) { - ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); + ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, k, insert); if (ret) - return ret; + goto err; } } - if (flags & BTREE_TRIGGER_GC) { - struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); - + if (flags & BTREE_TRIGGER_gc) { percpu_down_read(&c->mark_lock); - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + struct bucket *g = gc_bucket(ca, bucket.offset); bucket_lock(g); - struct bucket old = *g; - - u8 bucket_data_type = g->data_type; - int ret = __mark_pointer(trans, k, &p.ptr, *sectors, - data_type, g->gen, - &bucket_data_type, - &g->dirty_sectors, - &g->cached_sectors); - if (ret) { - bucket_unlock(g); - percpu_up_read(&c->mark_lock); - return ret; + struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; + ret = __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &new); + if (!ret) { + alloc_to_bucket(g, new); + bch2_dev_usage_update(c, ca, &old, &new, 0, true); } - - g->data_type = bucket_data_type; - struct bucket new = *g; bucket_unlock(g); - bch2_dev_usage_update_m(c, ca, &old, &new); percpu_up_read(&c->mark_lock); } - - 
return 0; +err: + bch2_dev_put(ca); + return ret; } static int bch2_trigger_stripe_ptr(struct btree_trans *trans, struct bkey_s_c k, struct extent_ptr_decoded p, enum bch_data_type data_type, - s64 sectors, unsigned flags) + s64 sectors, + enum btree_iter_update_trigger_flags flags) { - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { struct btree_iter iter; struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx), - BTREE_ITER_WITH_UPDATES, stripe); + BTREE_ITER_with_updates, stripe); int ret = PTR_ERR_OR_ZERO(s); if (unlikely(ret)) { bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, @@ -920,10 +1074,10 @@ err: return ret; } - if (flags & BTREE_TRIGGER_GC) { + if (flags & BTREE_TRIGGER_gc) { struct bch_fs *c = trans->c; - BUG_ON(!(flags & BTREE_TRIGGER_GC)); + BUG_ON(!(flags & BTREE_TRIGGER_gc)); struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); if (!m) { @@ -959,9 +1113,10 @@ err: static int __trigger_extent(struct btree_trans *trans, enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) + struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags) { - bool gc = flags & BTREE_TRIGGER_GC; + bool gc = flags & BTREE_TRIGGER_gc; struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -970,7 +1125,7 @@ static int __trigger_extent(struct btree_trans *trans, enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ? 
BCH_DATA_btree : BCH_DATA_user; - s64 dirty_sectors = 0; + s64 replicas_sectors = 0; int ret = 0; r.e.data_type = data_type; @@ -996,7 +1151,7 @@ static int __trigger_extent(struct btree_trans *trans, return ret; } } else if (!p.has_ec) { - dirty_sectors += disk_sectors; + replicas_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); @@ -1014,8 +1169,8 @@ static int __trigger_extent(struct btree_trans *trans, if (r.e.nr_devs) { ret = !gc - ? bch2_update_replicas_list(trans, &r.e, dirty_sectors) - : bch2_update_replicas(c, k, &r.e, dirty_sectors, 0, true); + ? bch2_update_replicas_list(trans, &r.e, replicas_sectors) + : bch2_update_replicas(c, k, &r.e, replicas_sectors, 0, true); if (unlikely(ret && gc)) { struct printbuf buf = PRINTBUF; @@ -1031,15 +1186,18 @@ static int __trigger_extent(struct btree_trans *trans, } int bch2_trigger_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, + enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c); struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old); unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start; unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start; + if (unlikely(flags & BTREE_TRIGGER_check_repair)) + return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags); + /* if pointers aren't changing - nothing to do: */ if (new_ptrs_bytes == old_ptrs_bytes && !memcmp(new_ptrs.start, @@ -1047,7 +1205,7 @@ int bch2_trigger_extent(struct btree_trans *trans, new_ptrs_bytes)) return 0; - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { struct bch_fs *c = trans->c; int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) - (int) bch2_bkey_needs_rebalance(c, old); @@ -1060,8 +1218,8 @@ int 
bch2_trigger_extent(struct btree_trans *trans, } } - if (flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC)) - return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree_id, level, old, new, flags); + if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) + return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree, level, old, new, flags); return 0; } @@ -1069,17 +1227,17 @@ int bch2_trigger_extent(struct btree_trans *trans, /* KEY_TYPE_reservation */ static int __trigger_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) + enum btree_id btree_id, unsigned level, struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size * replicas; - if (flags & BTREE_TRIGGER_OVERWRITE) + if (flags & BTREE_TRIGGER_overwrite) sectors = -sectors; - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { int ret = bch2_replicas_deltas_realloc(trans, 0); if (ret) return ret; @@ -1090,7 +1248,7 @@ static int __trigger_reservation(struct btree_trans *trans, d->persistent_reserved[replicas - 1] += sectors; } - if (flags & BTREE_TRIGGER_GC) { + if (flags & BTREE_TRIGGER_gc) { percpu_down_read(&c->mark_lock); preempt_disable(); @@ -1110,7 +1268,7 @@ static int __trigger_reservation(struct btree_trans *trans, int bch2_trigger_reservation(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags); } @@ -1118,22 +1276,16 @@ int bch2_trigger_reservation(struct btree_trans *trans, /* Mark superblocks: */ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - struct bch_dev *ca, size_t b, + struct 
bch_dev *ca, u64 b, enum bch_data_type type, unsigned sectors) { struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_i_alloc_v4 *a; int ret = 0; - /* - * Backup superblock might be past the end of our normal usable space: - */ - if (b >= ca->mi.nbuckets) - return 0; - - a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b)); + struct bkey_i_alloc_v4 *a = + bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b)); if (IS_ERR(a)) return PTR_ERR(a); @@ -1161,20 +1313,75 @@ err: return ret; } +static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + u64 b, enum bch_data_type data_type, unsigned sectors, + enum btree_iter_update_trigger_flags flags) +{ + int ret = 0; + + percpu_down_read(&c->mark_lock); + struct bucket *g = gc_bucket(ca, b); + + bucket_lock(g); + struct bch_alloc_v4 old = bucket_m_to_alloc(*g); + + if (bch2_fs_inconsistent_on(g->data_type && + g->data_type != data_type, c, + "different types of data in same bucket: %s, %s", + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type))) { + ret = -EIO; + goto err; + } + + if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, + "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size", + ca->dev_idx, b, g->gen, + bch2_data_type_str(g->data_type ?: data_type), + g->dirty_sectors, sectors)) { + ret = -EIO; + goto err; + } + + g->data_type = data_type; + g->dirty_sectors += sectors; + struct bch_alloc_v4 new = bucket_m_to_alloc(*g); +err: + bucket_unlock(g); + if (!ret) + bch2_dev_usage_update(c, ca, &old, &new, 0, true); + percpu_up_read(&c->mark_lock); + return ret; +} + int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - struct bch_dev *ca, size_t b, - enum bch_data_type type, - unsigned sectors) + struct bch_dev *ca, u64 b, + enum bch_data_type type, unsigned sectors, + enum btree_iter_update_trigger_flags flags) { - return commit_do(trans, NULL, NULL, 0, - 
__bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); + BUG_ON(type != BCH_DATA_free && + type != BCH_DATA_sb && + type != BCH_DATA_journal); + + /* + * Backup superblock might be past the end of our normal usable space: + */ + if (b >= ca->mi.nbuckets) + return 0; + + if (flags & BTREE_TRIGGER_gc) + return bch2_mark_metadata_bucket(trans->c, ca, b, type, sectors, flags); + else if (flags & BTREE_TRIGGER_transactional) + return commit_do(trans, NULL, NULL, 0, + __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); + else + BUG(); } static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, - struct bch_dev *ca, - u64 start, u64 end, - enum bch_data_type type, - u64 *bucket, unsigned *bucket_sectors) + struct bch_dev *ca, u64 start, u64 end, + enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors, + enum btree_iter_update_trigger_flags flags) { do { u64 b = sector_to_bucket(ca, start); @@ -1183,7 +1390,7 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, if (b != *bucket && *bucket_sectors) { int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket, - type, *bucket_sectors); + type, *bucket_sectors, flags); if (ret) return ret; @@ -1198,8 +1405,8 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, return 0; } -static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, - struct bch_dev *ca) +static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca, + enum btree_iter_update_trigger_flags flags) { struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; u64 bucket = 0; @@ -1212,21 +1419,21 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, if (offset == BCH_SB_SECTOR) { ret = bch2_trans_mark_metadata_sectors(trans, ca, 0, BCH_SB_SECTOR, - BCH_DATA_sb, &bucket, &bucket_sectors); + BCH_DATA_sb, &bucket, &bucket_sectors, flags); if (ret) return ret; } ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, offset + (1 << 
layout->sb_max_size_bits), - BCH_DATA_sb, &bucket, &bucket_sectors); + BCH_DATA_sb, &bucket, &bucket_sectors, flags); if (ret) return ret; } if (bucket_sectors) { ret = bch2_trans_mark_metadata_bucket(trans, ca, - bucket, BCH_DATA_sb, bucket_sectors); + bucket, BCH_DATA_sb, bucket_sectors, flags); if (ret) return ret; } @@ -1234,7 +1441,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, for (i = 0; i < ca->journal.nr; i++) { ret = bch2_trans_mark_metadata_bucket(trans, ca, ca->journal.buckets[i], - BCH_DATA_journal, ca->mi.bucket_size); + BCH_DATA_journal, ca->mi.bucket_size, flags); if (ret) return ret; } @@ -1242,20 +1449,22 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, return 0; } -int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) +int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca, + enum btree_iter_update_trigger_flags flags) { - int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca)); - + int ret = bch2_trans_run(c, + __bch2_trans_mark_dev_sb(trans, ca, flags)); bch_err_fn(c, ret); return ret; } -int bch2_trans_mark_dev_sbs(struct bch_fs *c) +int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c, + enum btree_iter_update_trigger_flags flags) { for_each_online_member(c, ca) { - int ret = bch2_trans_mark_dev_sb(c, ca); + int ret = bch2_trans_mark_dev_sb(c, ca, flags); if (ret) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } } @@ -1263,6 +1472,11 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c) return 0; } +int bch2_trans_mark_dev_sbs(struct bch_fs *c) +{ + return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional); +} + /* Disk reservations: */ #define SECTORS_CACHE 1024 @@ -1331,6 +1545,31 @@ recalculate: /* Startup/shutdown: */ +void bch2_buckets_nouse_free(struct bch_fs *c) +{ + for_each_member_device(c, ca) { + kvfree_rcu_mightsleep(ca->buckets_nouse); + ca->buckets_nouse = NULL; + } +} + +int bch2_buckets_nouse_alloc(struct bch_fs *c) +{ + 
for_each_member_device(c, ca) { + BUG_ON(ca->buckets_nouse); + + ca->buckets_nouse = kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO); + if (!ca->buckets_nouse) { + bch2_dev_put(ca); + return -BCH_ERR_ENOMEM_buckets_nouse; + } + } + + return 0; +} + static void bucket_gens_free_rcu(struct rcu_head *rcu) { struct bucket_gens *buckets = @@ -1342,24 +1581,17 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu) int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; - unsigned long *buckets_nouse = NULL; bool resize = ca->bucket_gens != NULL; int ret; + BUG_ON(resize && ca->buckets_nouse); + if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets, GFP_KERNEL|__GFP_ZERO))) { ret = -BCH_ERR_ENOMEM_bucket_gens; goto err; } - if ((c->opts.buckets_nouse && - !(buckets_nouse = kvmalloc(BITS_TO_LONGS(nbuckets) * - sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO)))) { - ret = -BCH_ERR_ENOMEM_buckets_nouse; - goto err; - } - bucket_gens->first_bucket = ca->mi.first_bucket; bucket_gens->nbuckets = nbuckets; @@ -1377,17 +1609,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) memcpy(bucket_gens->b, old_bucket_gens->b, n); - if (buckets_nouse) - memcpy(buckets_nouse, - ca->buckets_nouse, - BITS_TO_LONGS(n) * sizeof(unsigned long)); } rcu_assign_pointer(ca->bucket_gens, bucket_gens); bucket_gens = old_bucket_gens; - swap(ca->buckets_nouse, buckets_nouse); - nbuckets = ca->mi.nbuckets; if (resize) { @@ -1398,7 +1624,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ret = 0; err: - kvfree(buckets_nouse); if (bucket_gens) call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index f9af5adabe83..617ffde2fb7a 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -12,7 +12,7 @@ #include "extents.h" 
#include "sb-members.h" -static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) +static inline u64 sector_to_bucket(const struct bch_dev *ca, sector_t s) { return div_u64(s, ca->mi.bucket_size); } @@ -30,8 +30,7 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) return remainder; } -static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, - u32 *offset) +static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, u32 *offset) { return div_u64_rem(s, ca->mi.bucket_size, offset); } @@ -94,7 +93,7 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { struct bucket_array *buckets = gc_bucket_array(ca); - BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); + BUG_ON(!bucket_valid(ca, b)); return buckets->b + b; } @@ -111,7 +110,7 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) { struct bucket_gens *gens = bucket_gens(ca); - BUG_ON(b < gens->first_bucket || b >= gens->nbuckets); + BUG_ON(!bucket_valid(ca, b)); return gens->b + b; } @@ -121,20 +120,16 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, return sector_to_bucket(ca, ptr->offset); } -static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c, - const struct bch_extent_ptr *ptr) +static inline struct bpos PTR_BUCKET_POS(const struct bch_dev *ca, + const struct bch_extent_ptr *ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); } -static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c, +static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_dev *ca, const struct bch_extent_ptr *ptr, u32 *bucket_offset) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset)); } @@ -175,17 +170,19 @@ static inline int gen_after(u8 a, u8 b) return r > 0 ? 
r : 0; } +static inline u8 dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) +{ + return gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); +} + /** - * ptr_stale() - check if a pointer points into a bucket that has been + * dev_ptr_stale() - check if a pointer points into a bucket that has been * invalidated. */ -static inline u8 ptr_stale(struct bch_dev *ca, - const struct bch_extent_ptr *ptr) +static inline u8 dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - u8 ret; - rcu_read_lock(); - ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); + u8 ret = dev_ptr_stale_rcu(ca, ptr); rcu_read_unlock(); return ret; @@ -306,8 +303,6 @@ bch2_fs_usage_read_short(struct bch_fs *); void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *, const struct bch_alloc_v4 *, const struct bch_alloc_v4 *, u64, bool); -void bch2_dev_usage_update_m(struct bch_fs *, struct bch_dev *, - struct bucket *, struct bucket *); /* key/bucket marking: */ @@ -333,27 +328,29 @@ int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned); void bch2_fs_usage_initialize(struct bch_fs *); -int bch2_check_bucket_ref(struct btree_trans *, struct bkey_s_c, - const struct bch_extent_ptr *, - s64, enum bch_data_type, u8, u8, u32); +int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *, + struct bkey_s_c, const struct bch_extent_ptr *, + s64, enum bch_data_type, u8, u8, u32 *); -int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, - size_t, enum bch_data_type, unsigned, - struct gc_pos, unsigned); +int bch2_check_fix_ptrs(struct btree_trans *, + enum btree_id, unsigned, struct bkey_s_c, + enum btree_iter_update_trigger_flags); int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned, - struct 
bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\ ({ \ int ret = 0; \ \ if (_old.k->type) \ - ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \ + ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_insert); \ if (!ret && _new.k->type) \ - ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_OVERWRITE);\ + ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_overwrite);\ ret; \ }) @@ -362,9 +359,13 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *); void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); -int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, - size_t, enum bch_data_type, unsigned); -int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); +int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64, + enum bch_data_type, unsigned, + enum btree_iter_update_trigger_flags); +int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *, + enum btree_iter_update_trigger_flags); +int bch2_trans_mark_dev_sbs_flags(struct bch_fs *, + enum btree_iter_update_trigger_flags); int bch2_trans_mark_dev_sbs(struct bch_fs *); static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) @@ -464,6 +465,9 @@ static inline u64 avail_factor(u64 r) return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); } +void bch2_buckets_nouse_free(struct bch_fs *); +int bch2_buckets_nouse_alloc(struct bch_fs *); + int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); void bch2_dev_buckets_free(struct bch_dev *); int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 
4d14f19f5185..9e54323f0f5f 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -32,12 +32,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, if (dev >= c->sb.nr_devices) return ERR_PTR(-EINVAL); - rcu_read_lock(); - ca = rcu_dereference(c->devs[dev]); - if (ca) - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - + ca = bch2_dev_tryget_noerror(c, dev); if (!ca) return ERR_PTR(-EINVAL); } else { @@ -391,7 +386,7 @@ static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) return PTR_ERR(ca); ret = bch2_dev_offline(c, ca, arg.flags); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } @@ -420,7 +415,7 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c, if (ret) bch_err(c, "Error setting device state: %s", bch2_err_str(ret)); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } @@ -615,7 +610,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, arg.d[i].fragmented = src.d[i].fragmented; } - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return copy_to_user_errcode(user_arg, &arg, sizeof(arg)); } @@ -667,7 +662,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, goto err; } err: - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } @@ -689,11 +684,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c, if (arg.flags & BCH_READ_DEV) { ca = bch2_device_lookup(c, arg.dev, arg.flags); - - if (IS_ERR(ca)) { - ret = PTR_ERR(ca); - goto err; - } + ret = PTR_ERR_OR_ZERO(ca); + if (ret) + goto err_unlock; sb = ca->disk_sb.sb; } else { @@ -708,8 +701,8 @@ static long bch2_ioctl_read_super(struct bch_fs *c, ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb, vstruct_bytes(sb)); err: - if (!IS_ERR_OR_NULL(ca)) - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); +err_unlock: mutex_unlock(&c->sb_lock); return ret; } @@ -753,7 +746,7 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c, ret = bch2_dev_resize(c, ca, arg.nbuckets); - percpu_ref_put(&ca->ref); + 
bch2_dev_put(ca); return ret; } @@ -779,7 +772,7 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } @@ -961,7 +954,9 @@ static const struct file_operations bch_chardev_fops = { }; static int bch_chardev_major; -static struct class *bch_chardev_class; +static const struct class bch_chardev_class = { + .name = "bcachefs", +}; static struct device *bch_chardev; void bch2_fs_chardev_exit(struct bch_fs *c) @@ -978,7 +973,7 @@ int bch2_fs_chardev_init(struct bch_fs *c) if (c->minor < 0) return c->minor; - c->chardev = device_create(bch_chardev_class, NULL, + c->chardev = device_create(&bch_chardev_class, NULL, MKDEV(bch_chardev_major, c->minor), c, "bcachefs%u-ctl", c->minor); if (IS_ERR(c->chardev)) @@ -989,32 +984,39 @@ int bch2_fs_chardev_init(struct bch_fs *c) void bch2_chardev_exit(void) { - if (!IS_ERR_OR_NULL(bch_chardev_class)) - device_destroy(bch_chardev_class, - MKDEV(bch_chardev_major, U8_MAX)); - if (!IS_ERR_OR_NULL(bch_chardev_class)) - class_destroy(bch_chardev_class); + device_destroy(&bch_chardev_class, MKDEV(bch_chardev_major, U8_MAX)); + class_unregister(&bch_chardev_class); if (bch_chardev_major > 0) unregister_chrdev(bch_chardev_major, "bcachefs"); } int __init bch2_chardev_init(void) { + int ret; + bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); if (bch_chardev_major < 0) return bch_chardev_major; - bch_chardev_class = class_create("bcachefs"); - if (IS_ERR(bch_chardev_class)) - return PTR_ERR(bch_chardev_class); + ret = class_register(&bch_chardev_class); + if (ret) + goto major_out; - bch_chardev = device_create(bch_chardev_class, NULL, + bch_chardev = device_create(&bch_chardev_class, NULL, MKDEV(bch_chardev_major, U8_MAX), NULL, "bcachefs-ctl"); - if (IS_ERR(bch_chardev)) - return PTR_ERR(bch_chardev); + if (IS_ERR(bch_chardev)) { + ret = PTR_ERR(bch_chardev); + goto class_out; + } return 
0; + +class_out: + class_unregister(&bch_chardev_class); +major_out: + unregister_chrdev(bch_chardev_major, "bcachefs-ctl"); + return ret; } #endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 088fd2e7bdf1..85198f391e9c 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -469,9 +469,8 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, /* BCH_SB_FIELD_crypt: */ -static int bch2_sb_crypt_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); @@ -494,14 +493,10 @@ static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); - prt_newline(out); - prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); - prt_newline(out); - prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); - prt_newline(out); - prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); - prt_newline(out); + prt_printf(out, "KFD: %llu\n", BCH_CRYPT_KDF_TYPE(crypt)); + prt_printf(out, "scrypt n: %llu\n", BCH_KDF_SCRYPT_N(crypt)); + prt_printf(out, "scrypt r: %llu\n", BCH_KDF_SCRYPT_R(crypt)); + prt_printf(out, "scrypt p: %llu\n", BCH_KDF_SCRYPT_P(crypt)); } const struct bch_sb_field_ops bch_sb_field_ops_crypt = { diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 0022b51ce3c0..0d807c2ce9c6 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -106,7 +106,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, m->btree_id, bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); while (1) { struct bkey_s_c k; @@ 
-203,6 +203,8 @@ restart_drop_conflicting_replicas: /* Now, drop excess replicas: */ restart_drop_extra_replicas: + + rcu_read_lock(); bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); @@ -214,6 +216,7 @@ restart_drop_extra_replicas: goto restart_drop_extra_replicas; } } + rcu_read_unlock(); /* Finally, add the pointers we just wrote: */ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) @@ -288,7 +291,7 @@ restart_drop_extra_replicas: k.k->p, insert->k.p) ?: bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?: bch2_trans_update(trans, &iter, insert, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, &op->res, NULL, BCH_TRANS_COMMIT_no_check_rw| @@ -357,10 +360,11 @@ void bch2_data_update_exit(struct data_update *update) bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); if (c->opts.nocow_enabled) bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), 0); - percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref); + PTR_BUCKET_POS(ca, ptr), 0); + bch2_dev_put(ca); } bch2_bkey_buf_exit(&update->k, c); @@ -386,8 +390,10 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, while (bio_sectors(bio)) { unsigned sectors = bio_sectors(bio); + bch2_trans_begin(trans); + bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos, - BTREE_ITER_SLOTS); + BTREE_ITER_slots); ret = lockrestart_do(trans, ({ k = bch2_btree_iter_peek_slot(&iter); bkey_err(k); @@ -465,7 +471,6 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, while (data_opts.kill_ptrs) { unsigned i = 0, drop = __fls(data_opts.kill_ptrs); - struct bch_extent_ptr *ptr; bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop); data_opts.kill_ptrs ^= 1U << drop; @@ -480,15 +485,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, /* * 
Since we're not inserting through an extent iterator - * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * (BTREE_ITER_all_snapshots iterators aren't extent iterators), * we aren't using the extent overwrite path to delete, we're * just using the normal key deletion path: */ - if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents)) n->k.size = 0; return bch2_trans_relock(trans) ?: - bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } @@ -539,15 +544,26 @@ int bch2_data_update_init(struct btree_trans *trans, m->op.compression_opt = background_compression(io_opts); m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; - bkey_for_each_ptr(ptrs, ptr) - percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref); + bkey_for_each_ptr(ptrs, ptr) { + if (!bch2_dev_tryget(c, ptr->dev)) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; + bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev)); + } + return -BCH_ERR_data_update_done; + } + } unsigned durability_have = 0, durability_removing = 0; i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev); + struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); bool locked; + rcu_read_lock(); if (((1U << i) & m->data_opts.rewrite_ptrs)) { BUG_ON(p.ptr.cached); @@ -561,6 +577,7 @@ int bch2_data_update_init(struct btree_trans *trans, bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); durability_have += bch2_extent_ptr_durability(c, &p); } + rcu_read_unlock(); /* * op->csum_type is normally initialized from the fs/file's @@ -579,15 +596,13 @@ int bch2_data_update_init(struct btree_trans *trans, if (ctxt) { move_ctxt_wait_event(ctxt, (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, - PTR_BUCKET_POS(c, 
&p.ptr), 0)) || + bucket, 0)) || list_empty(&ctxt->ios)); if (!locked) - bch2_bucket_nocow_lock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0); + bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); } else { - if (!bch2_bucket_nocow_trylock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0)) { + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) { ret = -BCH_ERR_nocow_lock_blocked; goto err; } @@ -649,10 +664,11 @@ int bch2_data_update_init(struct btree_trans *trans, err: i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev); + struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); if ((1U << i) & ptrs_locked) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0); - percpu_ref_put(&bch_dev_bkey_exists(c, p.ptr.dev)->ref); + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); + bch2_dev_put(ca); i++; } diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index cd99b7399414..51cbf3928361 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -37,11 +37,11 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, struct btree_node *n_ondisk = c->verify_ondisk; struct btree_node *n_sorted = c->verify_data->data; struct bset *sorted, *inmemory = &b->data->keys; - struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); struct bio *bio; bool failed = false, saw_error = false; - if (!bch2_dev_get_ioref(ca, READ)) + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + if (!ca) return false; bio = bio_alloc_bioset(ca->disk_sb.bdev, @@ -194,8 +194,8 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, return; } - ca = bch_dev_bkey_exists(c, pick.ptr.dev); - if (!bch2_dev_get_ioref(ca, READ)) { + ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + if (!ca) { prt_printf(out, "error getting device to read from: not online\n"); return; } @@ -375,8 +375,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, return 
flush_buf(i) ?: bch2_trans_run(i->c, for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ bch2_bkey_val_to_text(&i->buf, i->c, k); prt_newline(&i->buf); bch2_trans_unlock(trans); @@ -459,8 +459,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, return flush_buf(i) ?: bch2_trans_run(i->c, for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ struct btree_path_level *l = &btree_iter_path(trans, &iter)->l[0]; struct bkey_packed *_k = @@ -492,51 +492,26 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); - prt_printf(out, "%px btree=%s l=%u ", - b, - bch2_btree_id_str(b->c.btree_id), - b->c.level); - prt_newline(out); + prt_printf(out, "%px btree=%s l=%u\n", b, bch2_btree_id_str(b->c.btree_id), b->c.level); printbuf_indent_add(out, 2); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); prt_newline(out); - prt_printf(out, "flags: "); - prt_tab(out); + prt_printf(out, "flags:\t"); prt_bitflags(out, bch2_btree_node_flags, b->flags); prt_newline(out); - prt_printf(out, "pcpu read locks: "); - prt_tab(out); - prt_printf(out, "%u", b->c.lock.readers != NULL); - prt_newline(out); - - prt_printf(out, "written:"); - prt_tab(out); - prt_printf(out, "%u", b->written); - prt_newline(out); - - prt_printf(out, "writes blocked:"); - prt_tab(out); - prt_printf(out, "%u", !list_empty_careful(&b->write_blocked)); - prt_newline(out); - - prt_printf(out, "will make reachable:"); - prt_tab(out); - prt_printf(out, "%lx", b->will_make_reachable); - prt_newline(out); - - prt_printf(out, "journal pin %px:", &b->writes[0].journal); - prt_tab(out); - prt_printf(out, "%llu", b->writes[0].journal.seq); - prt_newline(out); + prt_printf(out, "pcpu read locks:\t%u\n", 
b->c.lock.readers != NULL); + prt_printf(out, "written:\t%u\n", b->written); + prt_printf(out, "writes blocked:\t%u\n", !list_empty_careful(&b->write_blocked)); + prt_printf(out, "will make reachable:\t%lx\n", b->will_make_reachable); - prt_printf(out, "journal pin %px:", &b->writes[1].journal); - prt_tab(out); - prt_printf(out, "%llu", b->writes[1].journal.seq); - prt_newline(out); + prt_printf(out, "journal pin %px:\t%llu\n", + &b->writes[0].journal, b->writes[0].journal.seq); + prt_printf(out, "journal pin %px:\t%llu\n", + &b->writes[1].journal, b->writes[1].journal.seq); printbuf_indent_sub(out, 2); } @@ -625,8 +600,7 @@ restart: bch2_btree_trans_to_text(&i->buf, trans); - prt_printf(&i->buf, "backtrace:"); - prt_newline(&i->buf); + prt_printf(&i->buf, "backtrace:\n"); printbuf_indent_add(&i->buf, 2); bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL); printbuf_indent_sub(&i->buf, 2); @@ -782,25 +756,20 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, !bch2_btree_transaction_fns[i->iter]) break; - prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]); - prt_newline(&i->buf); + prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]); printbuf_indent_add(&i->buf, 2); mutex_lock(&s->lock); - prt_printf(&i->buf, "Max mem used: %u", s->max_mem); - prt_newline(&i->buf); - - prt_printf(&i->buf, "Transaction duration:"); - prt_newline(&i->buf); + prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); + prt_printf(&i->buf, "Transaction duration:\n"); printbuf_indent_add(&i->buf, 2); bch2_time_stats_to_text(&i->buf, &s->duration); printbuf_indent_sub(&i->buf, 2); if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { - prt_printf(&i->buf, "Lock hold times:"); - prt_newline(&i->buf); + prt_printf(&i->buf, "Lock hold times:\n"); printbuf_indent_add(&i->buf, 2); bch2_time_stats_to_text(&i->buf, &s->lock_hold_times); @@ -808,8 +777,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, } if 
(s->max_paths_text) { - prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths); - prt_newline(&i->buf); + prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths); printbuf_indent_add(&i->buf, 2); prt_str_indented(&i->buf, s->max_paths_text); diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index d37bd07afbfe..6bbf9a7d9e4d 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -98,7 +98,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { }; int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); @@ -118,7 +118,7 @@ int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, * Check new keys don't exceed the max length * (older keys may be larger.) */ - bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX, c, err, + bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, c, err, dirent_name_too_long, "dirent name too big (%u > %u)", d_name.len, BCH_NAME_MAX); @@ -205,7 +205,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, - bch_str_hash_flags_t str_hash_flags) + enum btree_iter_update_trigger_flags flags) { subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir }; struct bkey_i_dirent *dirent; @@ -220,9 +220,8 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, dirent->k.p.snapshot = snapshot; ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, snapshot, - &dirent->k_i, str_hash_flags, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + dir_inum, snapshot, &dirent->k_i, + flags|BTREE_UPDATE_internal_snapshot_node); *dir_offset = dirent->k.p.offset; return ret; @@ -232,7 +231,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, const struct 
bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, - bch_str_hash_flags_t str_hash_flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_i_dirent *dirent; int ret; @@ -243,7 +242,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, return ret; ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir, &dirent->k_i, str_hash_flags); + dir, &dirent->k_i, flags); *dir_offset = dirent->k.p.offset; return ret; @@ -272,7 +271,7 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, } else { target->subvol = le32_to_cpu(d.v->d_child_subvol); - ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s); + ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_cached, &s); target->inum = le64_to_cpu(s.inode); } @@ -301,13 +300,9 @@ int bch2_dirent_rename(struct btree_trans *trans, memset(dst_inum, 0, sizeof(*dst_inum)); /* Lookup src: */ - ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, - src_hash, src_dir, src_name, - BTREE_ITER_INTENT); - if (ret) - goto out; - - old_src = bch2_btree_iter_peek_slot(&src_iter); + old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, + src_hash, src_dir, src_name, + BTREE_ITER_intent); ret = bkey_err(old_src); if (ret) goto out; @@ -329,13 +324,9 @@ int bch2_dirent_rename(struct btree_trans *trans, if (ret) goto out; } else { - ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name, - BTREE_ITER_INTENT); - if (ret) - goto out; - - old_dst = bch2_btree_iter_peek_slot(&dst_iter); + old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, + dst_hash, dst_dir, dst_name, + BTREE_ITER_intent); ret = bkey_err(old_dst); if (ret) goto out; @@ -450,7 +441,7 @@ out_set_src: if (delete_src) { bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); ret = bch2_btree_iter_traverse(&src_iter) ?: - bch2_btree_delete_at(trans, &src_iter, 
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node); if (ret) goto out; } @@ -458,7 +449,7 @@ out_set_src: if (delete_dst) { bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot); ret = bch2_btree_iter_traverse(&dst_iter) ?: - bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node); if (ret) goto out; } @@ -479,13 +470,9 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans, const struct qstr *name, subvol_inum *inum, unsigned flags) { - int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir, name, flags); - if (ret) - return ret; - - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); + struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, + hash_info, dir, name, flags); + int ret = bkey_err(k); if (ret) goto err; @@ -541,16 +528,26 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot); } +static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target) +{ + struct qstr name = bch2_dirent_get_name(d); + bool ret = dir_emit(ctx, name.name, + name.len, + target.inum, + vfs_d_type(d.v->d_type)); + if (ret) + ctx->pos = d.k->p.offset + 1; + return ret; +} + int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; - struct bkey_s_c_dirent dirent; subvol_inum target; u32 snapshot; struct bkey_buf sk; - struct qstr name; int ret; bch2_bkey_buf_init(&sk); @@ -567,7 +564,9 @@ retry: if (k.k->type != KEY_TYPE_dirent) continue; - dirent = bkey_s_c_to_dirent(k); + /* dir_emit() can fault and block: */ + bch2_bkey_buf_reassemble(&sk, c, k); + struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); ret = 
bch2_dirent_read_target(trans, inum, dirent, &target); if (ret < 0) @@ -575,28 +574,22 @@ retry: if (ret) continue; - /* dir_emit() can fault and block: */ - bch2_bkey_buf_reassemble(&sk, c, k); - dirent = bkey_i_to_s_c_dirent(sk.k); - bch2_trans_unlock(trans); - - name = bch2_dirent_get_name(dirent); - - ctx->pos = dirent.k->p.offset; - if (!dir_emit(ctx, name.name, - name.len, - target.inum, - vfs_d_type(dirent.v->d_type))) - break; - ctx->pos = dirent.k->p.offset + 1; - /* * read_target looks up subvolumes, we can overflow paths if the * directory has many subvolumes in it + * + * XXX: btree_trans_too_many_iters() is something we'd like to + * get rid of, and there's no good reason to be using it here + * except that we don't yet have a for_each_btree_key() helper + * that does subvolume_get_snapshot(). */ - ret = btree_trans_too_many_iters(trans); - if (ret) + ret = drop_locks_do(trans, + bch2_dir_emit(ctx, dirent, target)) ?: + btree_trans_too_many_iters(trans); + if (ret) { + ret = ret < 0 ? 
ret : 0; break; + } } bch2_trans_iter_exit(trans, &iter); err: diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index bee55cca2aa0..24037e6e0a09 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -4,11 +4,11 @@ #include "str_hash.h" -enum bkey_invalid_flags; +enum bch_validate_flags; extern const struct bch_hash_desc bch2_dirent_hash_desc; int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ @@ -38,11 +38,11 @@ int bch2_dirent_read_target(struct btree_trans *, subvol_inum, int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, - bch_str_hash_flags_t); + enum btree_iter_update_trigger_flags); int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, - bch_str_hash_flags_t); + enum btree_iter_update_trigger_flags); static inline unsigned vfs_d_type(unsigned type) { diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 06a7df529b40..521a86df5e52 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -18,9 +18,8 @@ static int group_cmp(const void *_l, const void *_r) strncmp(l->label, r->label, sizeof(l->label)); } -static int bch2_sb_disk_groups_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_disk_groups *groups = field_to_type(f, disk_groups); @@ -177,7 +176,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i); struct bch_disk_group_cpu *dst; - if (!bch2_member_exists(&m)) + if 
(!bch2_member_alive(&m)) continue; g = BCH_MEMBER_GROUP(&m); @@ -523,7 +522,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, ca = bch2_dev_lookup(c, val); if (!IS_ERR(ca)) { *res = dev_to_target(ca->dev_idx); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return 0; } @@ -588,7 +587,7 @@ static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsi case TARGET_DEV: { struct bch_member m = bch2_sb_member_get(sb, t.dev); - if (bch2_dev_exists(sb, t.dev)) { + if (bch2_member_exists(sb, t.dev)) { prt_printf(out, "Device "); pr_uuid(out, m.uuid.b); prt_printf(out, " (%u)", t.dev); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 556a217108d3..b26dc7424662 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -107,7 +107,7 @@ struct ec_bio { /* Stripes btree keys: */ int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; @@ -163,146 +163,189 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, /* Triggers: */ -static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, - struct bkey_s_c_stripe s, - unsigned idx, bool deleting) +static int __mark_stripe_bucket(struct btree_trans *trans, + struct bch_dev *ca, + struct bkey_s_c_stripe s, + unsigned ptr_idx, bool deleting, + struct bpos bucket, + struct bch_alloc_v4 *a, + enum btree_iter_update_trigger_flags flags) { - struct bch_fs *c = trans->c; - const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; - struct btree_iter iter; - struct bkey_i_alloc_v4 *a; - enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant - ? BCH_DATA_parity : 0; - s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; + const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; + unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant; + bool parity = ptr_idx >= nr_data; + enum bch_data_type data_type = parity ? 
BCH_DATA_parity : BCH_DATA_stripe; + s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0; + struct printbuf buf = PRINTBUF; int ret = 0; + struct bch_fs *c = trans->c; if (deleting) sectors = -sectors; - a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); - if (IS_ERR(a)) - return PTR_ERR(a); - - ret = bch2_check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, - a->v.gen, a->v.data_type, - a->v.dirty_sectors); - if (ret) - goto err; - if (!deleting) { - if (bch2_trans_inconsistent_on(a->v.stripe || - a->v.stripe_redundancy, trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_type_str(a->v.data_type), - a->v.dirty_sectors, - a->v.stripe, s.k->p.offset)) { + if (bch2_trans_inconsistent_on(a->stripe || + a->stripe_redundancy, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s", + bucket.inode, bucket.offset, a->gen, + bch2_data_type_str(a->data_type), + a->dirty_sectors, + a->stripe, s.k->p.offset, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = -EIO; goto err; } - if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_type_str(a->v.data_type), - a->v.dirty_sectors, - s.k->p.offset)) { + if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s", + bucket.inode, bucket.offset, a->gen, + bch2_data_type_str(a->data_type), + a->dirty_sectors, + a->cached_sectors, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = -EIO; goto err; } - - a->v.stripe = s.k->p.offset; - a->v.stripe_redundancy = s.v->nr_redundant; - a->v.data_type = BCH_DATA_stripe; } else { 
- if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || - a->v.stripe_redundancy != s.v->nr_redundant, trans, - "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", - iter.pos.inode, iter.pos.offset, a->v.gen, - s.k->p.offset, a->v.stripe)) { + if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset || + a->stripe_redundancy != s.v->nr_redundant, trans, + "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s", + bucket.inode, bucket.offset, a->gen, + a->stripe, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = -EIO; goto err; } - a->v.stripe = 0; - a->v.stripe_redundancy = 0; - a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); + if (bch2_trans_inconsistent_on(a->data_type != data_type, trans, + "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s", + bucket.inode, bucket.offset, a->gen, + bch2_data_type_str(a->data_type), + bch2_data_type_str(data_type), + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + ret = -EIO; + goto err; + } + + if (bch2_trans_inconsistent_on(parity && + (a->dirty_sectors != -sectors || + a->cached_sectors), trans, + "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s", + bucket.inode, bucket.offset, a->gen, + a->dirty_sectors, + a->cached_sectors, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + ret = -EIO; + goto err; + } } - a->v.dirty_sectors += sectors; - if (data_type) - a->v.data_type = !deleting ? 
data_type : 0; + if (sectors) { + ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type, + a->gen, a->data_type, &a->dirty_sectors); + if (ret) + goto err; + } - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - if (ret) - goto err; + if (!deleting) { + a->stripe = s.k->p.offset; + a->stripe_redundancy = s.v->nr_redundant; + } else { + a->stripe = 0; + a->stripe_redundancy = 0; + } + + alloc_data_type_set(a, data_type); err: - bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); return ret; } static int mark_stripe_bucket(struct btree_trans *trans, - struct bkey_s_c k, - unsigned ptr_idx, - unsigned flags) + struct bkey_s_c_stripe s, + unsigned ptr_idx, bool deleting, + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned nr_data = s->nr_blocks - s->nr_redundant; - bool parity = ptr_idx >= nr_data; - enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; - s64 sectors = parity ? 
le16_to_cpu(s->sectors) : 0; - const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket old, new, *g; - struct printbuf buf = PRINTBUF; + const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; int ret = 0; - BUG_ON(!(flags & BTREE_TRIGGER_GC)); + struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); + if (unlikely(!ca)) { + if (!(flags & BTREE_TRIGGER_overwrite)) + ret = -EIO; + goto err; + } - /* * XXX doesn't handle deletion */ + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - percpu_down_read(&c->mark_lock); - g = PTR_GC_BUCKET(ca, ptr); + if (flags & BTREE_TRIGGER_transactional) { + struct bkey_i_alloc_v4 *a = + bch2_trans_start_alloc_update(trans, bucket); + ret = PTR_ERR_OR_ZERO(a) ?: + __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags); + } - if (g->dirty_sectors || - (g->stripe && g->stripe != k.k->p.offset)) { - bch2_fs_inconsistent(c, - "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EINVAL; - goto err; + if (flags & BTREE_TRIGGER_gc) { + percpu_down_read(&c->mark_lock); + struct bucket *g = gc_bucket(ca, bucket.offset); + bucket_lock(g); + struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; + ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); + if (!ret) { + alloc_to_bucket(g, new); + bch2_dev_usage_update(c, ca, &old, &new, 0, true); + } + bucket_unlock(g); + percpu_up_read(&c->mark_lock); } +err: + bch2_dev_put(ca); + return ret; +} - bucket_lock(g); - old = *g; +static int mark_stripe_buckets(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + enum btree_iter_update_trigger_flags flags) +{ + const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe + ? 
bkey_s_c_to_stripe(new).v : NULL; - ret = bch2_check_bucket_ref(trans, k, ptr, sectors, data_type, - g->gen, g->data_type, - g->dirty_sectors); - if (ret) - goto err; + BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks); - g->data_type = data_type; - g->dirty_sectors += sectors; + unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; - g->stripe = k.k->p.offset; - g->stripe_redundancy = s->nr_redundant; - new = *g; -err: - bucket_unlock(g); - if (!ret) - bch2_dev_usage_update_m(c, ca, &old, &new); - percpu_up_read(&c->mark_lock); - printbuf_exit(&buf); - return ret; + for (unsigned i = 0; i < nr_blocks; i++) { + if (new_s && old_s && + !memcmp(&new_s->ptrs[i], + &old_s->ptrs[i], + sizeof(new_s->ptrs[i]))) + continue; + + if (new_s) { + int ret = mark_stripe_bucket(trans, + bkey_s_c_to_stripe(new), i, false, flags); + if (ret) + return ret; + } + + if (old_s) { + int ret = mark_stripe_bucket(trans, + bkey_s_c_to_stripe(old), i, true, flags); + if (ret) + return ret; + } + } + + return 0; } int bch2_trigger_stripe(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, + enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s _new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_s_c new = _new.s_c; struct bch_fs *c = trans->c; @@ -312,7 +355,10 @@ int bch2_trigger_stripe(struct btree_trans *trans, const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe ? bkey_s_c_to_stripe(new).v : NULL; - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (unlikely(flags & BTREE_TRIGGER_check_repair)) + return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags); + + if (flags & BTREE_TRIGGER_transactional) { /* * If the pointers aren't changing, we don't need to do anything: */ @@ -347,31 +393,12 @@ int bch2_trigger_stripe(struct btree_trans *trans, return ret; } - unsigned nr_blocks = new_s ? 
new_s->nr_blocks : old_s->nr_blocks; - for (unsigned i = 0; i < nr_blocks; i++) { - if (new_s && old_s && - !memcmp(&new_s->ptrs[i], - &old_s->ptrs[i], - sizeof(new_s->ptrs[i]))) - continue; - - if (new_s) { - int ret = bch2_trans_mark_stripe_bucket(trans, - bkey_s_c_to_stripe(new), i, false); - if (ret) - return ret; - } - - if (old_s) { - int ret = bch2_trans_mark_stripe_bucket(trans, - bkey_s_c_to_stripe(old), i, true); - if (ret) - return ret; - } - } + int ret = mark_stripe_buckets(trans, old, new, flags); + if (ret) + return ret; } - if (flags & BTREE_TRIGGER_ATOMIC) { + if (flags & BTREE_TRIGGER_atomic) { struct stripe *m = genradix_ptr(&c->stripes, idx); if (!m) { @@ -410,7 +437,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, } } - if (flags & BTREE_TRIGGER_GC) { + if (flags & BTREE_TRIGGER_gc) { struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); @@ -439,13 +466,11 @@ int bch2_trigger_stripe(struct btree_trans *trans, */ memset(m->block_sectors, 0, sizeof(m->block_sectors)); - for (unsigned i = 0; i < new_s->nr_blocks; i++) { - int ret = mark_stripe_bucket(trans, new, i, flags); - if (ret) - return ret; - } + int ret = mark_stripe_buckets(trans, old, new, flags); + if (ret) + return ret; - int ret = bch2_update_replicas(c, new, &m->r.e, + ret = bch2_update_replicas(c, new, &m->r.e, ((s64) m->sectors * m->nr_redundant), 0, true); if (ret) { @@ -608,19 +633,21 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) struct bch_csum got = ec_block_checksum(buf, i, offset); if (bch2_crc_cmp(want, got)) { - struct printbuf err = PRINTBUF; - struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev); + struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev); + if (ca) { + struct printbuf err = PRINTBUF; - prt_str(&err, "stripe "); - bch2_csum_err_msg(&err, v->csum_type, want, got); - prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); - bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); 
- bch_err_ratelimited(ca, "%s", err.buf); - printbuf_exit(&err); + prt_str(&err, "stripe "); + bch2_csum_err_msg(&err, v->csum_type, want, got); + prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); + bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); + bch_err_ratelimited(ca, "%s", err.buf); + printbuf_exit(&err); - clear_bit(i, buf->valid); + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + } - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + clear_bit(i, buf->valid); break; } @@ -687,7 +714,7 @@ static void ec_block_endio(struct bio *bio) bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); - if (ptr_stale(ca, ptr)) { + if (dev_ptr_stale(ca, ptr)) { bch_err_ratelimited(ca->fs, "error %s stripe: stale pointer after io", bio_data_dir(bio) == READ ? "reading from" : "writing to"); @@ -705,25 +732,26 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned offset = 0, bytes = buf->size << 9; struct bch_extent_ptr *ptr = &v->ptrs[idx]; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant ? BCH_DATA_user : BCH_DATA_parity; int rw = op_is_write(opf); - if (ptr_stale(ca, ptr)) { - bch_err_ratelimited(c, - "error %s stripe: stale pointer", - rw == READ ? "reading from" : "writing to"); + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw); + if (!ca) { clear_bit(idx, buf->valid); return; } - if (!bch2_dev_get_ioref(ca, rw)) { + if (dev_ptr_stale(ca, ptr)) { + bch_err_ratelimited(c, + "error %s stripe: stale pointer", + rw == READ ? 
"reading from" : "writing to"); clear_bit(idx, buf->valid); return; } + this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); while (offset < bytes) { @@ -769,7 +797,7 @@ static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, int ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - POS(0, idx), BTREE_ITER_SLOTS); + POS(0, idx), BTREE_ITER_slots); ret = bkey_err(k); if (ret) goto err; @@ -1060,7 +1088,7 @@ static int ec_stripe_delete(struct btree_trans *trans, u64 idx) int ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = bkey_err(k); if (ret) goto err; @@ -1131,7 +1159,7 @@ static int ec_stripe_key_update(struct btree_trans *trans, int ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - new->k.p, BTREE_ITER_INTENT); + new->k.p, BTREE_ITER_intent); ret = bkey_err(k); if (ret) goto err; @@ -1173,6 +1201,7 @@ err: } static int ec_stripe_update_extent(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket, u8 gen, struct ec_stripe_buf *s, struct bpos *bp_pos) @@ -1183,13 +1212,13 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c k; const struct bch_extent_ptr *ptr_c; - struct bch_extent_ptr *ptr, *ec_ptr = NULL; + struct bch_extent_ptr *ec_ptr = NULL; struct bch_extent_stripe_ptr stripe_ptr; struct bkey_i *n; int ret, dev, block; - ret = bch2_get_next_backpointer(trans, bucket, gen, - bp_pos, &bp, BTREE_ITER_CACHED); + ret = bch2_get_next_backpointer(trans, ca, bucket, gen, + bp_pos, &bp, BTREE_ITER_cached); if (ret) return ret; if (bpos_eq(*bp_pos, SPOS_MAX)) @@ -1214,7 +1243,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, return -EIO; } - k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT); + k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent); ret = bkey_err(k); if (ret) return ret; @@ -1272,17 +1301,21 @@ static int 
ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b { struct bch_fs *c = trans->c; struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - struct bch_extent_ptr bucket = v->ptrs[block]; - struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); + struct bch_extent_ptr ptr = v->ptrs[block]; struct bpos bp_pos = POS_MIN; int ret = 0; + struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); + if (!ca) + return -EIO; + + struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); + while (1) { ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc, - ec_stripe_update_extent(trans, bucket_pos, bucket.gen, - s, &bp_pos)); + ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos)); if (ret) break; if (bkey_eq(bp_pos, POS_MAX)) @@ -1291,6 +1324,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b bp_pos = bpos_nosnap_successor(bp_pos); } + bch2_dev_put(ca); return ret; } @@ -1321,20 +1355,18 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, unsigned block, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); - unsigned offset = ca->mi.bucket_size - ob->sectors_free; - int ret; - - if (!bch2_dev_get_ioref(ca, WRITE)) { + struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE); + if (!ca) { s->err = -BCH_ERR_erofs_no_writes; return; } + unsigned offset = ca->mi.bucket_size - ob->sectors_free; memset(s->new_stripe.data[block] + (offset << 9), 0, ob->sectors_free << 9); - ret = blkdev_issue_zeroout(ca->disk_sb.bdev, + int ret = blkdev_issue_zeroout(ca->disk_sb.bdev, ob->bucket * ca->mi.bucket_size + offset, ob->sectors_free, GFP_KERNEL, 0); @@ -1519,16 +1551,13 @@ void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) { struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); - struct bch_dev *ca; - unsigned offset; - if (!ob) return NULL; 
BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]); - ca = bch_dev_bkey_exists(c, ob->dev); - offset = ca->mi.bucket_size - ob->sectors_free; + struct bch_dev *ca = ob_dev(c, ob); + unsigned offset = ca->mi.bucket_size - ob->sectors_free; return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); } @@ -1937,7 +1966,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st } for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { if (bkey_gt(k.k->p, POS(0, U32_MAX))) { if (start_pos.offset) { start_pos = min_pos; @@ -2127,7 +2156,7 @@ int bch2_stripes_read(struct bch_fs *c) { int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ if (k.k->type != KEY_TYPE_stripe) continue; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index f042616888b0..84a23eeb6249 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -6,14 +6,15 @@ #include "buckets_types.h" #include "extents_types.h" -enum bkey_invalid_flags; +enum bch_validate_flags; int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_stripe ((struct bkey_ops) { \ .key_invalid = bch2_stripe_invalid, \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 82a6656c941c..c66eeffcd7f2 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -176,6 +176,21 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) return s; } +/* s/fix?/fixing/ s/recreate?/recreating/ */ +static void 
prt_actioning(struct printbuf *out, const char *action) +{ + unsigned len = strlen(action); + + BUG_ON(action[len - 1] != '?'); + --len; + + if (action[len - 1] == 'e') + --len; + + prt_bytes(out, action, len); + prt_str(out, "ing"); +} + int bch2_fsck_err(struct bch_fs *c, enum bch_fsck_flags flags, enum bch_sb_error_id err, @@ -186,6 +201,7 @@ int bch2_fsck_err(struct bch_fs *c, bool print = true, suppressing = false, inconsistent = false; struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; + const char *action_orig = "fix?", *action = action_orig; if ((flags & FSCK_CAN_FIX) && test_bit(err, c->sb.errors_silent)) @@ -197,6 +213,19 @@ int bch2_fsck_err(struct bch_fs *c, prt_vprintf(out, fmt, args); va_end(args); + /* Custom fix/continue/recreate/etc.? */ + if (out->buf[out->pos - 1] == '?') { + const char *p = strrchr(out->buf, ','); + if (p) { + out->pos = p - out->buf; + action = kstrdup(p + 2, GFP_KERNEL); + if (!action) { + ret = -ENOMEM; + goto err; + } + } + } + mutex_lock(&c->fsck_error_msgs_lock); s = fsck_err_get(c, fmt); if (s) { @@ -208,12 +237,16 @@ int bch2_fsck_err(struct bch_fs *c, if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { ret = s->ret; mutex_unlock(&c->fsck_error_msgs_lock); - printbuf_exit(&buf); - return ret; + goto err; } kfree(s->last_msg); s->last_msg = kstrdup(buf.buf, GFP_KERNEL); + if (!s->last_msg) { + mutex_unlock(&c->fsck_error_msgs_lock); + ret = -ENOMEM; + goto err; + } if (c->opts.ratelimit_errors && !(flags & FSCK_NO_RATELIMIT) && @@ -239,7 +272,8 @@ int bch2_fsck_err(struct bch_fs *c, inconsistent = true; ret = -BCH_ERR_fsck_errors_not_fixed; } else if (flags & FSCK_CAN_FIX) { - prt_str(out, ", fixing"); + prt_str(out, ", "); + prt_actioning(out, action); ret = -BCH_ERR_fsck_fix; } else { prt_str(out, ", continuing"); @@ -254,16 +288,16 @@ int bch2_fsck_err(struct bch_fs *c, : c->opts.fix_errors; if (fix == FSCK_FIX_ask) { - int ask; + prt_str(out, ", "); + prt_str(out, action); - prt_str(out, ": 
fix?"); if (bch2_fs_stdio_redirect(c)) bch2_print(c, "%s", out->buf); else bch2_print_string_as_lines(KERN_ERR, out->buf); print = false; - ask = bch2_fsck_ask_yn(c); + int ask = bch2_fsck_ask_yn(c); if (ask >= YN_ALLNO && s) s->fix = ask == YN_ALLNO @@ -276,10 +310,12 @@ int bch2_fsck_err(struct bch_fs *c, } else if (fix == FSCK_FIX_yes || (c->opts.nochanges && !(flags & FSCK_CAN_IGNORE))) { - prt_str(out, ", fixing"); + prt_str(out, ", "); + prt_actioning(out, action); ret = -BCH_ERR_fsck_fix; } else { - prt_str(out, ", not fixing"); + prt_str(out, ", not "); + prt_actioning(out, action); } } else if (flags & FSCK_NEED_FSCK) { prt_str(out, " (run fsck to correct)"); @@ -311,8 +347,6 @@ int bch2_fsck_err(struct bch_fs *c, mutex_unlock(&c->fsck_error_msgs_lock); - printbuf_exit(&buf); - if (inconsistent) bch2_inconsistent_error(c); @@ -322,7 +356,10 @@ int bch2_fsck_err(struct bch_fs *c, set_bit(BCH_FS_errors_not_fixed, &c->flags); set_bit(BCH_FS_error, &c->flags); } - +err: + if (action != action_orig) + kfree(action); + printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index b9033bb4f11c..5f4fecb358da 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -72,7 +72,7 @@ static int count_iters_for_insert(struct btree_trans *trans, for_each_btree_key_norestart(trans, iter, BTREE_ID_reflink, POS(0, idx + offset), - BTREE_ITER_SLOTS, r_k, ret2) { + BTREE_ITER_slots, r_k, ret2) { if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors))) break; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 1a331e539204..469037929685 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -71,6 +71,12 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, } } +static inline u64 dev_latency(struct bch_fs *c, unsigned dev) +{ + struct bch_dev *ca = bch2_dev_rcu(c, dev); + return ca ? 
atomic64_read(&ca->cur_latency[READ]) : S64_MAX; +} + /* * returns true if p1 is better than p2: */ @@ -79,11 +85,8 @@ static inline bool ptr_better(struct bch_fs *c, const struct extent_ptr_decoded p2) { if (likely(!p1.idx && !p2.idx)) { - struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); - struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); - - u64 l1 = atomic64_read(&dev1->cur_latency[READ]); - u64 l2 = atomic64_read(&dev2->cur_latency[READ]); + u64 l1 = dev_latency(c, p1.ptr.dev); + u64 l2 = dev_latency(c, p2.ptr.dev); /* Pick at random, biased in favor of the faster device: */ @@ -109,21 +112,21 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, const union bch_extent_entry *entry; struct extent_ptr_decoded p; struct bch_dev_io_failures *f; - struct bch_dev *ca; int ret = 0; if (k.k->type == KEY_TYPE_error) return -EIO; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { /* * Unwritten extent: no need to actually read, treat it as a * hole and return 0s: */ - if (p.ptr.unwritten) - return 0; - - ca = bch_dev_bkey_exists(c, p.ptr.dev); + if (p.ptr.unwritten) { + ret = 0; + break; + } /* * If there are any dirty pointers it's an error if we can't @@ -132,7 +135,9 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (!ret && !p.ptr.cached) ret = -EIO; - if (p.ptr.cached && ptr_stale(ca, &p.ptr)) + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + + if (p.ptr.cached && (!ca || dev_ptr_stale(ca, &p.ptr))) continue; f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; @@ -141,12 +146,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, ? 
f->idx : f->idx + 1; - if (!p.idx && - !bch2_dev_is_readable(ca)) + if (!p.idx && !ca) p.idx++; - if (bch2_force_reconstruct_read && - !p.idx && p.has_ec) + if (!p.idx && p.has_ec && bch2_force_reconstruct_read) + p.idx++; + + if (!p.idx && !bch2_dev_is_readable(ca)) p.idx++; if (p.idx >= (unsigned) p.has_ec + 1) @@ -158,6 +164,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, *pick = p; ret = 1; } + rcu_read_unlock(); return ret; } @@ -165,7 +172,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -186,7 +193,7 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); @@ -201,6 +208,11 @@ int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, c, err, btree_ptr_v2_min_key_bad, "min_key > key"); + if (flags & BCH_VALIDATE_write) + bkey_fsck_err_on(!bp.v->sectors_written, + c, err, btree_ptr_v2_written_0, + "sectors_written == 0"); + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); fsck_err: return ret; @@ -247,7 +259,6 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) const union bch_extent_entry *en_r; struct extent_ptr_decoded lp, rp; bool use_right_ptr; - struct bch_dev *ca; en_l = l_ptrs.start; en_r = r_ptrs.start; @@ -278,8 +289,12 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) return false; /* Extents may not straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp.ptr.dev); - if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr)) + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev); + bool 
same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr); + rcu_read_unlock(); + + if (!same_bucket) return false; if (lp.has_ec != rp.has_ec || @@ -385,7 +400,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); @@ -667,16 +682,16 @@ static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev); + struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); - return __extent_ptr_durability(ca, p); + return ca ? __extent_ptr_durability(ca, p) : 0; } unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev); + struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); - if (ca->mi.state == BCH_MEMBER_STATE_failed) + if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed) return 0; return __extent_ptr_durability(ca, p); @@ -689,8 +704,10 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; unsigned durability = 0; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) durability += bch2_extent_ptr_durability(c, &p); + rcu_read_unlock(); return durability; } @@ -702,9 +719,11 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; unsigned durability = 0; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) durability += bch2_extent_ptr_durability(c, &p); + rcu_read_unlock(); return durability; } @@ -833,8 +852,6 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, 
void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) { - struct bch_extent_ptr *ptr; - bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); } @@ -860,14 +877,21 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_dev *ca; + bool ret = false; + rcu_read_lock(); bkey_for_each_ptr(ptrs, ptr) if (bch2_dev_in_target(c, ptr->dev, target) && + (ca = bch2_dev_rcu(c, ptr->dev)) && (!ptr->cached || - !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) - return true; + !dev_ptr_stale_rcu(ca, ptr))) { + ret = true; + break; + } + rcu_read_unlock(); - return false; + return ret; } bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, @@ -969,21 +993,23 @@ void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) */ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) { - struct bch_extent_ptr *ptr; + struct bch_dev *ca; + rcu_read_lock(); bch2_bkey_drop_ptrs(k, ptr, ptr->cached && - ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); + (ca = bch2_dev_rcu(c, ptr->dev)) && + dev_ptr_stale_rcu(ca, ptr)); + rcu_read_unlock(); return bkey_deleted(k.k); } void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) { - struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? 
bch_dev_bkey_exists(c, ptr->dev) - : NULL; - + out->atomic++; + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); if (!ca) { prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, (u64) ptr->offset, ptr->gen, @@ -998,11 +1024,11 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc prt_str(out, " cached"); if (ptr->unwritten) prt_str(out, " unwritten"); - if (b >= ca->mi.first_bucket && - b < ca->mi.nbuckets && - ptr_stale(ca, ptr)) + if (bucket_valid(ca, b) && dev_ptr_stale_rcu(ca, ptr)) prt_printf(out, " stale"); } + rcu_read_unlock(); + --out->atomic; } void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, @@ -1069,55 +1095,50 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, static int extent_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, const struct bch_extent_ptr *ptr, unsigned size_ondisk, bool metadata, struct printbuf *err) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - u64 bucket; - u32 bucket_offset; - struct bch_dev *ca; int ret = 0; - if (!bch2_dev_exists2(c, ptr->dev)) { - /* - * If we're in the write path this key might have already been - * overwritten, and we could be seeing a device that doesn't - * exist anymore due to racing with device removal: - */ - if (flags & BKEY_INVALID_WRITE) - return 0; - - bkey_fsck_err(c, err, ptr_to_invalid_device, - "pointer to invalid device (%u)", ptr->dev); + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) { + rcu_read_unlock(); + return 0; } + u32 bucket_offset; + u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); + unsigned first_bucket = ca->mi.first_bucket; + u64 nbuckets = ca->mi.nbuckets; + unsigned bucket_size = ca->mi.bucket_size; + rcu_read_unlock(); - ca = bch_dev_bkey_exists(c, ptr->dev); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr2) bkey_fsck_err_on(ptr != ptr2 && 
ptr->dev == ptr2->dev, c, err, ptr_to_duplicate_device, "multiple pointers to same device (%u)", ptr->dev); - bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); - bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err, + bkey_fsck_err_on(bucket >= nbuckets, c, err, ptr_after_last_bucket, - "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets); - bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err, + "pointer past last bucket (%llu > %llu)", bucket, nbuckets); + bkey_fsck_err_on(bucket < first_bucket, c, err, ptr_before_first_bucket, - "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket); - bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err, + "pointer before first bucket (%llu < %u)", bucket, first_bucket); + bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, c, err, ptr_spans_multiple_buckets, "pointer spans multiple buckets (%u + %u > %u)", - bucket_offset, size_ondisk, ca->mi.bucket_size); + bucket_offset, size_ondisk, bucket_size); fsck_err: return ret; } int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -1193,7 +1214,7 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on(crc_is_encoded(crc) && (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && - (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err, + (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), c, err, ptr_crc_uncompressed_size_too_big, "too large encoded extent"); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 528e817eacbd..1ade959652b2 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -8,7 +8,7 @@ struct bch_fs; struct btree_trans; -enum bkey_invalid_flags; +enum bch_validate_flags; /* extent entries: */ @@ -406,12 +406,12 @@ int bch2_bkey_pick_read_device(struct 
bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); @@ -448,7 +448,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -654,7 +654,7 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, do { \ struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ \ - _ptr = &_ptrs.start->ptr; \ + struct bch_extent_ptr *_ptr = &_ptrs.start->ptr; \ \ while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ if (_cond) { \ @@ -680,7 +680,7 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_ptr_swab(struct bkey_s); diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c index 0f955c3c76a7..2eaffe37b5e7 100644 --- a/fs/bcachefs/eytzinger.c +++ b/fs/bcachefs/eytzinger.c @@ -171,7 +171,7 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, swap_r_func_t swap_func, const void *priv) { 
- int i, c, r; + int i, j, k; /* called from 'sort' without swap function, let's pick the default */ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) @@ -188,17 +188,22 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, /* heapify */ for (i = n / 2 - 1; i >= 0; --i) { - for (r = i; r * 2 + 1 < n; r = c) { - c = r * 2 + 1; + /* Find the sift-down path all the way to the leaves. */ + for (j = i; k = j * 2 + 1, k + 1 < n;) + j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; - if (c + 1 < n && - eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) - c++; + /* Special case for the last leaf with no sibling. */ + if (j * 2 + 2 == n) + j = j * 2 + 1; - if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) - break; + /* Backtrack to the correct location. */ + while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0) + j = (j - 1) / 2; - eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + /* Shift the element into its correct place. */ + for (k = j; j != i;) { + j = (j - 1) / 2; + eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); } } @@ -206,17 +211,22 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, for (i = n - 1; i > 0; --i) { eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); - for (r = 0; r * 2 + 1 < i; r = c) { - c = r * 2 + 1; + /* Find the sift-down path all the way to the leaves. */ + for (j = 0; k = j * 2 + 1, k + 1 < i;) + j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; - if (c + 1 < i && - eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) - c++; + /* Special case for the last leaf with no sibling. */ + if (j * 2 + 2 == i) + j = j * 2 + 1; - if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) - break; + /* Backtrack to the correct location. 
*/ + while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0) + j = (j - 1) / 2; - eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + /* Shift the element into its correct place. */ + for (k = j; j;) { + j = (j - 1) / 2; + eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); } } } @@ -232,3 +242,64 @@ void eytzinger0_sort(void *base, size_t n, size_t size, return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); } + +#if 0 +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/ktime.h> + +static u64 cmp_count; + +static int mycmp(const void *a, const void *b) +{ + u32 _a = *(u32 *)a; + u32 _b = *(u32 *)b; + + cmp_count++; + if (_a < _b) + return -1; + else if (_a > _b) + return 1; + else + return 0; +} + +static int test(void) +{ + size_t N, i; + ktime_t start, end; + s64 delta; + u32 *arr; + + for (N = 10000; N <= 100000; N += 10000) { + arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL); + cmp_count = 0; + + for (i = 0; i < N; i++) + arr[i] = get_random_u32(); + + start = ktime_get(); + eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL); + end = ktime_get(); + + delta = ktime_us_delta(end, start); + printk(KERN_INFO "time: %lld\n", delta); + printk(KERN_INFO "comparisons: %lld\n", cmp_count); + + u32 prev = 0; + + eytzinger0_for_each(i, N) { + if (prev > arr[i]) + goto err; + prev = arr[i]; + } + + kfree(arr); + } + return 0; + +err: + kfree(arr); + return -1; +} +#endif diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 624e6f963240..508d029ac53d 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -42,7 +42,7 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); if (ret) goto err; @@ -70,7 +70,7 @@ int bch2_create_trans(struct btree_trans *trans, struct bch_subvolume s; ret = bch2_subvolume_get(trans, 
snapshot_src.subvol, true, - BTREE_ITER_CACHED, &s); + BTREE_ITER_cached, &s); if (ret) goto err; @@ -78,7 +78,7 @@ int bch2_create_trans(struct btree_trans *trans, } ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; @@ -163,7 +163,7 @@ int bch2_create_trans(struct btree_trans *trans, name, dir_target, &dir_offset, - BCH_HASH_SET_MUST_CREATE); + STR_HASH_must_create); if (ret) goto err; @@ -171,7 +171,7 @@ int bch2_create_trans(struct btree_trans *trans, new_inode->bi_dir_offset = dir_offset; } - inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + inode_iter.flags &= ~BTREE_ITER_all_snapshots; bch2_btree_iter_set_snapshot(&inode_iter, snapshot); ret = bch2_btree_iter_traverse(&inode_iter) ?: @@ -198,16 +198,16 @@ int bch2_link_trans(struct btree_trans *trans, if (dir.subvol != inum.subvol) return -EXDEV; - ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent); if (ret) - goto err; + return ret; inode_u->bi_ctime = now; ret = bch2_inode_nlink_inc(inode_u); if (ret) - return ret; + goto err; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); if (ret) goto err; @@ -223,7 +223,7 @@ int bch2_link_trans(struct btree_trans *trans, ret = bch2_dirent_create(trans, dir, &dir_hash, mode_to_type(inode_u->bi_mode), name, inum.inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE); + STR_HASH_must_create); if (ret) goto err; @@ -255,19 +255,19 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bkey_s_c k; int ret; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); if (ret) goto err; dir_hash = bch2_hash_info_init(c, dir_u); ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, - name, &inum, 
BTREE_ITER_INTENT); + name, &inum, BTREE_ITER_intent); if (ret) goto err; ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; @@ -322,7 +322,7 @@ int bch2_unlink_trans(struct btree_trans *trans, ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, &dir_hash, &dirent_iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + BTREE_UPDATE_internal_snapshot_node) ?: bch2_inode_write(trans, &dir_iter, dir_u) ?: bch2_inode_write(trans, &inode_iter, inode_u); err: @@ -363,7 +363,7 @@ static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_p struct bkey_i_subvolume *s = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), - BTREE_ITER_CACHED, subvolume); + BTREE_ITER_cached, subvolume); int ret = PTR_ERR_OR_ZERO(s); if (ret) return ret; @@ -394,7 +394,7 @@ int bch2_rename_trans(struct btree_trans *trans, int ret; ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; @@ -403,7 +403,7 @@ int bch2_rename_trans(struct btree_trans *trans, if (dst_dir.inum != src_dir.inum || dst_dir.subvol != src_dir.subvol) { ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; @@ -423,13 +423,13 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; if (dst_inum.inum) { ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; } diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 39292e7ef342..b0a33fabadf8 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -30,15 +30,8 @@ static void bch2_readpages_end_io(struct bio *bio) { struct folio_iter fi; - bio_for_each_folio_all(fi, bio) 
{ - if (!bio->bi_status) { - folio_mark_uptodate(fi.folio); - } else { - folio_clear_uptodate(fi.folio); - folio_set_error(fi.folio); - } - folio_unlock(fi.folio); - } + bio_for_each_folio_all(fi, bio) + folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK); bio_put(bio); } @@ -176,7 +169,7 @@ retry: bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS); + BTREE_ITER_slots); while (1) { struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; @@ -408,7 +401,6 @@ static void bch2_writepage_io_done(struct bch_write_op *op) bio_for_each_folio_all(fi, bio) { struct bch_folio *s; - folio_set_error(fi.folio); mapping_set_error(fi.folio->mapping, -EIO); s = __bch2_folio(fi.folio); diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index b889370a5088..09d21aef879a 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -254,7 +254,7 @@ retry: for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, SPOS(inum.inum, offset, snapshot), - BTREE_ITER_SLOTS, k, err) { + BTREE_ITER_slots, k, err) { if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) break; diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index d359aa9b33b8..872283e5bd1e 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -214,7 +214,7 @@ retry: for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, SPOS(inum.inum, offset, snapshot), - BTREE_ITER_SLOTS, k, ret) { + BTREE_ITER_slots, k, ret) { unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); unsigned state = bkey_to_sector_state(k); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 20b40477425f..ef20b64033e0 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -202,7 +202,10 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) goto out; ret = bch2_flush_inode(c, inode); out: - return bch2_err_class(ret); + ret = 
bch2_err_class(ret); + if (ret == -EROFS) + ret = -EIO; + return ret; } /* truncate: */ @@ -594,7 +597,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inode->v.i_ino, start_sector), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); while (!ret && bkey_lt(iter.pos, end_pos)) { s64 i_sectors_delta = 0; @@ -1009,7 +1012,7 @@ retry: for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, SPOS(inode->v.i_ino, offset >> 9, snapshot), - BTREE_ITER_SLOTS, k, ret) { + BTREE_ITER_slots, k, ret) { if (k.k->p.inode != inode->v.i_ino) { next_hole = bch2_seek_pagecache_hole(&inode->v, offset, MAX_LFS_FILESIZE, 0, false); diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 3dc8630ff9fe..205a323ffc6d 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -548,7 +548,7 @@ long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) { /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { - case FS_IOC_GETFLAGS: + case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; break; case FS_IOC32_SETFLAGS: diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 65b04b3c2679..fd851f10d11c 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -90,7 +90,7 @@ retry: bch2_trans_begin(trans); ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), - BTREE_ITER_INTENT) ?: + BTREE_ITER_intent) ?: (set ? 
set(trans, inode, &inode_u, p) : 0) ?: bch2_inode_write(trans, &iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); @@ -213,19 +213,43 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino _ret; \ }) +static struct inode *bch2_alloc_inode(struct super_block *sb) +{ + BUG(); +} + +static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) +{ + struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); + if (!inode) + return NULL; + + inode_init_once(&inode->v); + mutex_init(&inode->ei_update_lock); + two_state_lock_init(&inode->ei_pagecache_lock); + INIT_LIST_HEAD(&inode->ei_vfs_inode_list); + mutex_init(&inode->ei_quota_lock); + inode->v.i_state = 0; + + if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) { + kmem_cache_free(bch2_inode_cache, inode); + return NULL; + } + + return inode; +} + /* * Allocate a new inode, dropping/retaking btree locks if necessary: */ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) { - struct bch_fs *c = trans->c; - struct bch_inode_info *inode = memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, - to_bch_ei(new_inode(c->vfs_sb))); + __bch2_new_inode(trans->c)); if (unlikely(!inode)) { - int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM); + int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 
0 : -ENOMEM); if (ret && inode) { __destroy_inode(&inode->v); kmem_cache_free(bch2_inode_cache, inode); @@ -290,7 +314,7 @@ __bch2_create(struct mnt_idmap *idmap, if (ret) return ERR_PTR(ret); #endif - inode = to_bch_ei(new_inode(c->vfs_sb)); + inode = __bch2_new_inode(c); if (unlikely(!inode)) { inode = ERR_PTR(-ENOMEM); goto err; @@ -323,7 +347,7 @@ retry: inum.inum = inode_u.bi_inum; ret = bch2_subvolume_get(trans, inum.subvol, true, - BTREE_ITER_WITH_UPDATES, &subvol) ?: + BTREE_ITER_with_updates, &subvol) ?: bch2_trans_commit(trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, @@ -376,17 +400,14 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter dirent_iter = {}; subvol_inum inum = {}; + struct printbuf buf = PRINTBUF; - int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, - dir_hash_info, dir, name, 0); + struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, + dir_hash_info, dir, name, 0); + int ret = bkey_err(k); if (ret) return ERR_PTR(ret); - struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter); - ret = bkey_err(k); - if (ret) - goto err; - ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); if (ret > 0) ret = -ENOENT; @@ -406,20 +427,31 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); - if (bch2_err_matches(ret, ENOENT)) { - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - bch_err(c, "%s points to missing inode", buf.buf); - printbuf_exit(&buf); - } + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), + c, "dirent to missing inode:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); if (ret) goto err; + /* regular files may have 
hardlinks: */ + if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) && + !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), + c, + "dirent points to inode that does not point back:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), + prt_printf(&buf, "\n "), + bch2_inode_unpacked_to_text(&buf, &inode_u), + buf.buf))) { + ret = -ENOENT; + goto err; + } + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); inode = bch2_inode_insert(c, inode); out: bch2_trans_iter_exit(trans, &dirent_iter); + printbuf_exit(&buf); return inode; err: inode = ERR_PTR(ret); @@ -787,7 +819,7 @@ retry: acl = NULL; ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto btree_err; @@ -1043,6 +1075,10 @@ retry: bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, iter.pos.offset + sectors)); + + ret = bch2_trans_relock(trans); + if (ret) + break; } start = iter.pos.offset; bch2_trans_iter_exit(trans, &iter); @@ -1490,34 +1526,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, mapping_set_large_folios(inode->v.i_mapping); } -static struct inode *bch2_alloc_inode(struct super_block *sb) -{ - struct bch_inode_info *inode; - - inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); - if (!inode) - return NULL; - - inode_init_once(&inode->v); - mutex_init(&inode->ei_update_lock); - two_state_lock_init(&inode->ei_pagecache_lock); - INIT_LIST_HEAD(&inode->ei_vfs_inode_list); - mutex_init(&inode->ei_quota_lock); - - return &inode->v; -} - -static void bch2_i_callback(struct rcu_head *head) -{ - struct inode *vinode = container_of(head, struct inode, i_rcu); - struct bch_inode_info *inode = to_bch_ei(vinode); - - kmem_cache_free(bch2_inode_cache, inode); -} - -static void bch2_destroy_inode(struct inode *vinode) +static void bch2_free_inode(struct inode *vinode) { - call_rcu(&vinode->i_rcu, bch2_i_callback); + kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode)); } static int 
inode_update_times_fn(struct btree_trans *trans, @@ -1825,7 +1836,7 @@ static int bch2_unfreeze(struct super_block *sb) static const struct super_operations bch_super_operations = { .alloc_inode = bch2_alloc_inode, - .destroy_inode = bch2_destroy_inode, + .free_inode = bch2_free_inode, .write_inode = bch2_vfs_write_inode, .evict_inode = bch2_evict_inode, .sync_fs = bch2_sync_fs, diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 8e2010212cc3..c8f57465131c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -79,7 +79,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS(0, inode_nr), - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_all_snapshots); k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) @@ -127,13 +127,13 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, u64 *target, unsigned *type, u32 snapshot) { struct btree_iter iter; - struct bkey_s_c_dirent d; - int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, - &hash_info, dir, name, 0, snapshot); + struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, + &hash_info, dir, name, 0, snapshot); + int ret = bkey_err(k); if (ret) return ret; - d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); *target = le64_to_cpu(d.v->d_inum); *type = d.v->d_type; bch2_trans_iter_exit(trans, &iter); @@ -154,12 +154,12 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) dir_hash_info = bch2_hash_info_init(c, &dir_inode); - bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&iter) ?: bch2_hash_delete_at(trans, bch2_dirent_hash_desc, &dir_hash_info, &iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + 
BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter); err: bch_err_fn(c, ret); @@ -274,9 +274,9 @@ create_lostfound: &lostfound_str, lostfound->bi_inum, &lostfound->bi_dir_offset, - BCH_HASH_SET_MUST_CREATE) ?: + STR_HASH_must_create) ?: bch2_inode_write_flags(trans, &lostfound_iter, lostfound, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); err: bch_err_msg(c, ret, "creating lost+found"); bch2_trans_iter_exit(trans, &lostfound_iter); @@ -333,7 +333,7 @@ static int reattach_inode(struct btree_trans *trans, &name, inode->bi_subvol ?: inode->bi_inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE); + STR_HASH_must_create); if (ret) return ret; @@ -486,14 +486,9 @@ static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 in return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG); } -struct snapshots_seen_entry { - u32 id; - u32 equiv; -}; - struct snapshots_seen { struct bpos pos; - DARRAY(struct snapshots_seen_entry) ids; + snapshot_id_list ids; }; static inline void snapshots_seen_exit(struct snapshots_seen *s) @@ -508,20 +503,15 @@ static inline void snapshots_seen_init(struct snapshots_seen *s) static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) { - struct snapshots_seen_entry *i, n = { - .id = id, - .equiv = bch2_snapshot_equiv(c, id), - }; - int ret = 0; - + u32 *i; __darray_for_each(s->ids, i) { - if (i->id == id) + if (*i == id) return 0; - if (i->id > id) + if (*i > id) break; } - ret = darray_insert_item(&s->ids, i - s->ids.data, n); + int ret = darray_insert_item(&s->ids, i - s->ids.data, id); if (ret) bch_err(c, "error reallocating snapshots_seen table (size %zu)", s->ids.size); @@ -531,42 +521,11 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, enum btree_id btree_id, struct bpos pos) { - struct snapshots_seen_entry n = { 
- .id = pos.snapshot, - .equiv = bch2_snapshot_equiv(c, pos.snapshot), - }; - int ret = 0; - if (!bkey_eq(s->pos, pos)) s->ids.nr = 0; - s->pos = pos; - s->pos.snapshot = n.equiv; - darray_for_each(s->ids, i) { - if (i->id == n.id) - return 0; - - /* - * We currently don't rigorously track for snapshot cleanup - * needing to be run, so it shouldn't be a fsck error yet: - */ - if (i->equiv == n.equiv) { - bch_err(c, "snapshot deletion did not finish:\n" - " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", - bch2_btree_id_str(btree_id), - pos.inode, pos.offset, - i->id, n.id, n.equiv); - set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots); - } - } - - ret = darray_push(&s->ids, n); - if (ret) - bch_err(c, "error reallocating snapshots_seen table (size %zu)", - s->ids.size); - return ret; + return snapshot_list_add_nodup(c, &s->ids, pos.snapshot); } /** @@ -586,12 +545,10 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see ssize_t i; EBUG_ON(id > ancestor); - EBUG_ON(!bch2_snapshot_is_equiv(c, id)); - EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); /* @ancestor should be the snapshot most recently added to @seen */ EBUG_ON(ancestor != seen->pos.snapshot); - EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv); + EBUG_ON(ancestor != darray_last(seen->ids)); if (id == ancestor) return true; @@ -610,9 +567,9 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see */ for (i = seen->ids.nr - 2; - i >= 0 && seen->ids.data[i].equiv >= id; + i >= 0 && seen->ids.data[i] >= id; --i) - if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv)) + if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i])) return false; return true; @@ -643,9 +600,6 @@ static int ref_visible2(struct bch_fs *c, u32 src, struct snapshots_seen *src_seen, u32 dst, struct snapshots_seen *dst_seen) { - src = 
bch2_snapshot_equiv(c, src); - dst = bch2_snapshot_equiv(c, dst); - if (dst > src) { swap(dst, src); swap(dst_seen, src_seen); @@ -692,7 +646,7 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, return darray_push(&w->inodes, ((struct inode_walker_entry) { .inode = u, - .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot), + .snapshot = inode.k->p.snapshot, })); } @@ -708,7 +662,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, w->inodes.nr = 0; for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + BTREE_ITER_all_snapshots, k, ret) { if (k.k->p.offset != inum) break; @@ -728,21 +682,20 @@ static struct inode_walker_entry * lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) { bool is_whiteout = k.k->type == KEY_TYPE_whiteout; - u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); struct inode_walker_entry *i; __darray_for_each(w->inodes, i) - if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot)) goto found; return NULL; found: - BUG_ON(snapshot > i->snapshot); + BUG_ON(k.k->p.snapshot > i->snapshot); - if (snapshot != i->snapshot && !is_whiteout) { + if (k.k->p.snapshot != i->snapshot && !is_whiteout) { struct inode_walker_entry new = *i; - new.snapshot = snapshot; + new.snapshot = k.k->p.snapshot; new.count = 0; struct printbuf buf = PRINTBUF; @@ -751,10 +704,10 @@ found: bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" "unexpected because we should always update the inode when we update a key in that inode\n" "%s", - w->last_pos.inode, snapshot, i->snapshot, buf.buf); + w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf); printbuf_exit(&buf); - while (i > w->inodes.data && i[-1].snapshot > snapshot) + while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot) --i; size_t pos = i - w->inodes.data; @@ -786,10 +739,10 @@ 
static struct inode_walker_entry *walk_inode(struct btree_trans *trans, return lookup_inode_for_snapshot(trans->c, w, k); } -static int __get_visible_inodes(struct btree_trans *trans, - struct inode_walker *w, - struct snapshots_seen *s, - u64 inum) +static int get_visible_inodes(struct btree_trans *trans, + struct inode_walker *w, + struct snapshots_seen *s, + u64 inum) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -799,19 +752,17 @@ static int __get_visible_inodes(struct btree_trans *trans, w->inodes.nr = 0; for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); - + BTREE_ITER_all_snapshots, k, ret) { if (k.k->p.offset != inum) break; - if (!ref_visible(c, s, s->pos.snapshot, equiv)) + if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) continue; if (bkey_is_inode(k.k)) add_inode(c, w, k); - if (equiv >= s->pos.snapshot) + if (k.k->p.snapshot >= s->pos.snapshot) break; } bch2_trans_iter_exit(trans, &iter); @@ -832,7 +783,7 @@ static int check_key_has_snapshot(struct btree_trans *trans, "key in missing snapshot: %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; + BTREE_UPDATE_internal_snapshot_node) ?: 1; fsck_err: printbuf_exit(&buf); return ret; @@ -861,8 +812,8 @@ static int hash_redo_key(struct btree_trans *trans, bch2_hash_set_in_snapshot(trans, desc, hash_info, (subvol_inum) { 0, k.k->p.inode }, k.k->p.snapshot, tmp, - BCH_HASH_SET_MUST_CREATE, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + STR_HASH_must_create| + BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } @@ -891,7 +842,7 @@ static int hash_check_key(struct btree_trans *trans, for_each_btree_key_norestart(trans, iter, desc.btree_id, SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), - BTREE_ITER_SLOTS, k, ret) { + BTREE_ITER_slots, 
k, ret) { if (bkey_eq(k.k->p, hash_k.k->p)) break; @@ -1233,7 +1184,7 @@ int bch2_check_inodes(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_inode(trans, &iter, k, &prev, &s, full))); @@ -1362,8 +1313,8 @@ static int overlapping_extents_found(struct btree_trans *trans, BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); bch2_trans_iter_init(trans, &iter1, btree, pos1, - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_NOT_EXTENTS); + BTREE_ITER_all_snapshots| + BTREE_ITER_not_extents); k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); ret = bkey_err(k1); if (ret) @@ -1425,7 +1376,7 @@ static int overlapping_extents_found(struct btree_trans *trans, trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); ret = bch2_trans_update_extent_overwrite(trans, old_iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + BTREE_UPDATE_internal_snapshot_node, k1, k2) ?: bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); bch2_disk_reservation_put(c, &res); @@ -1466,7 +1417,6 @@ static int check_overlapping_extents(struct btree_trans *trans, struct snapshots_seen *seen, struct extent_ends *extent_ends, struct bkey_s_c k, - u32 equiv, struct btree_iter *iter, bool *fixed) { @@ -1535,11 +1485,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct bch_fs *c = trans->c; struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; - struct bpos equiv = k.k->p; int ret = 0; - equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); - ret = check_key_has_snapshot(trans, iter, k); if (ret) { ret = ret < 0 ? 
ret : 0; @@ -1589,8 +1536,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, bch2_bkey_val_to_text(&buf, c, k), buf.buf))) goto delete; - ret = check_overlapping_extents(trans, s, extent_ends, k, - equiv.snapshot, iter, + ret = check_overlapping_extents(trans, s, extent_ends, k, iter, &inode->recalculate_sums); if (ret) goto err; @@ -1607,8 +1553,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, for (; inode->inodes.data && i >= inode->inodes.data; --i) { - if (i->snapshot > equiv.snapshot || - !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot)) + if (i->snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) continue; if (k.k->type != KEY_TYPE_whiteout) { @@ -1625,7 +1571,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, bch2_btree_iter_set_snapshot(&iter2, i->snapshot); ret = bch2_btree_iter_traverse(&iter2) ?: bch2_btree_delete_at(trans, &iter2, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter2); if (ret) goto err; @@ -1652,7 +1598,7 @@ fsck_err: bch_err_fn(c, ret); return ret; delete: - ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); goto out; } @@ -1673,7 +1619,7 @@ int bch2_check_extents(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_disk_reservation_put(c, &res); @@ -1698,7 +1644,7 @@ int bch2_check_indirect_extents(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ 
bch2_disk_reservation_put(c, &res); @@ -1767,6 +1713,15 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, if (inode_points_to_dirent(target, d)) return 0; + if (bch2_inode_should_have_bp(target) && + !fsck_err(c, inode_wrong_backpointer, + "dirent points to inode that does not point back:\n %s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n "), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf))) + goto out_noiter; + if (!target->bi_dir && !target->bi_dir_offset) { target->bi_dir = d.k->p.inode; @@ -1835,6 +1790,7 @@ out: err: fsck_err: bch2_trans_iter_exit(trans, &bp_iter); +out_noiter: printbuf_exit(&buf); bch_err_fn(c, ret); return ret; @@ -2052,7 +2008,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bch_fs *c = trans->c; struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; - struct bpos equiv; int ret = 0; ret = check_key_has_snapshot(trans, iter, k); @@ -2061,9 +2016,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto out; } - equiv = k.k->p; - equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); if (ret) goto err; @@ -2104,7 +2056,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); goto out; } @@ -2140,14 +2092,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; } else { - ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); + ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); if (ret) goto err; if (fsck_err_on(!target->inodes.nr, c, dirent_to_missing_inode, - "dirent points to missing inode: (equiv %u)\n%s", - equiv.snapshot, + "dirent points to missing inode:\n%s", 
(printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -2164,7 +2115,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, } if (d.v->d_type == DT_DIR) - for_each_visible_inode(c, s, dir, equiv.snapshot, i) + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) i->count++; } out: @@ -2191,7 +2142,7 @@ int bch2_check_dirents(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, @@ -2255,7 +2206,7 @@ int bch2_check_xattrs(struct bch_fs *c) ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, @@ -2422,7 +2373,7 @@ int bch2_check_subvolume_structure(struct bch_fs *c) { int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_subvol_path(trans, &iter, k))); bch_err_fn(c, ret); @@ -2457,7 +2408,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino struct btree_iter inode_iter = {}; struct bch_inode_unpacked inode; struct printbuf buf = PRINTBUF; - u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot); + u32 snapshot = inode_k.k->p.snapshot; int ret = 0; p->nr = 0; @@ -2559,9 +2510,9 @@ int bch2_check_directory_structure(struct bch_fs *c) ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_intent| + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ if 
(!bkey_is_inode(k.k)) continue; @@ -2661,9 +2612,9 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_intent| + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ if (!bkey_is_inode(k.k)) continue; @@ -2704,9 +2655,9 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_intent| + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); if (ret) break; @@ -2717,8 +2668,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links if (d.v->d_type != DT_DIR && d.v->d_type != DT_SUBVOL) inc_link(c, &s, links, range_start, range_end, - le64_to_cpu(d.v->d_inum), - bch2_snapshot_equiv(c, d.k->p.snapshot)); + le64_to_cpu(d.v->d_inum), d.k->p.snapshot); } 0; }))); @@ -2781,7 +2731,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS(0, range_start), - BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); if (ret < 0) { @@ -2849,7 +2799,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, u->v.front_pad = 0; u->v.back_pad = 0; - return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); + return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun); } int bch2_fix_reflink_p(struct bch_fs *c) @@ -2860,8 +2810,8 @@ int bch2_fix_reflink_p(struct bch_fs *c) int ret = 
bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS_MIN, - BTREE_ITER_INTENT|BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_intent|BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, fix_reflink_p_key(trans, &iter, k))); bch_err_fn(c, ret); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 0f95d7fb5ec0..aafa79fa6351 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -339,7 +339,7 @@ int bch2_inode_peek_nowarn(struct btree_trans *trans, k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, SPOS(0, inum.inum, snapshot), - flags|BTREE_ITER_CACHED); + flags|BTREE_ITER_cached); ret = bkey_err(k); if (ret) return ret; @@ -371,7 +371,7 @@ int bch2_inode_peek(struct btree_trans *trans, int bch2_inode_write_flags(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, - enum btree_update_flags flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_inode_buf *inode_p; @@ -399,7 +399,7 @@ int __bch2_fsck_write_inode(struct btree_trans *trans, return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, &inode_p->inode.k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); } int bch2_fsck_write_inode(struct btree_trans *trans, @@ -473,7 +473,7 @@ fsck_err: } int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); @@ -490,7 +490,7 @@ fsck_err: } int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); @@ -507,7 +507,7 @@ fsck_err: } int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_inode_v3 
inode = bkey_s_c_to_inode_v3(k); @@ -535,29 +535,19 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { printbuf_indent_add(out, 2); - prt_printf(out, "mode=%o", inode->bi_mode); - prt_newline(out); + prt_printf(out, "mode=%o\n", inode->bi_mode); prt_str(out, "flags="); prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); - prt_printf(out, " (%x)", inode->bi_flags); - prt_newline(out); + prt_printf(out, " (%x)\n", inode->bi_flags); - prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq); - prt_newline(out); - - prt_printf(out, "bi_size=%llu", inode->bi_size); - prt_newline(out); - - prt_printf(out, "bi_sectors=%llu", inode->bi_sectors); - prt_newline(out); - - prt_printf(out, "bi_version=%llu", inode->bi_version); - prt_newline(out); + prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq); + prt_printf(out, "bi_size=%llu\n", inode->bi_size); + prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors); + prt_printf(out, "bi_version=%llu\n", inode->bi_version); #define x(_name, _bits) \ - prt_printf(out, #_name "=%llu", (u64) inode->_name); \ - prt_newline(out); + prt_printf(out, #_name "=%llu\n", (u64) inode->_name); BCH_INODE_FIELDS_v3() #undef x printbuf_indent_sub(out, 2); @@ -604,11 +594,11 @@ int bch2_trigger_inode(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { s64 nr = (s64) bkey_is_inode(new.k) - (s64) bkey_is_inode(old.k); - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { if (nr) { int ret = bch2_replicas_deltas_realloc(trans, 0); if (ret) @@ -627,13 +617,13 @@ int bch2_trigger_inode(struct btree_trans *trans, } } - if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { + if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { BUG_ON(!trans->journal_res.seq); 
bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); } - if (flags & BTREE_TRIGGER_GC) { + if (flags & BTREE_TRIGGER_gc) { struct bch_fs *c = trans->c; percpu_down_read(&c->mark_lock); @@ -645,7 +635,7 @@ int bch2_trigger_inode(struct btree_trans *trans, } int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -762,8 +752,8 @@ int bch2_inode_create(struct btree_trans *trans, pos = start; bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_INTENT); + BTREE_ITER_all_snapshots| + BTREE_ITER_intent); again: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && @@ -824,7 +814,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, * extent iterator: */ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), - BTREE_ITER_INTENT); + BTREE_ITER_intent); while (1) { bch2_trans_begin(trans); @@ -846,7 +836,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bkey_init(&delete.k); delete.k.p = iter.pos; - if (iter.flags & BTREE_ITER_IS_EXTENTS) + if (iter.flags & BTREE_ITER_is_extents) bch2_key_resize(&delete.k, bpos_min(end, k.k->p).offset - iter.pos.offset); @@ -895,7 +885,7 @@ retry: k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, SPOS(0, inum.inum, snapshot), - BTREE_ITER_INTENT|BTREE_ITER_CACHED); + BTREE_ITER_intent|BTREE_ITER_cached); ret = bkey_err(k); if (ret) goto err; @@ -1055,7 +1045,7 @@ retry: bch2_trans_begin(trans); k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inum, snapshot), BTREE_ITER_INTENT); + SPOS(0, inum, snapshot), BTREE_ITER_intent); ret = bkey_err(k); if (ret) goto err; @@ -1100,7 +1090,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bch_inode_unpacked inode; int ret; - k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED); + k = 
bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached); ret = bkey_err(k); if (ret) return ret; @@ -1152,7 +1142,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, inode.bi_flags &= ~BCH_INODE_unlinked; ret = bch2_inode_write_flags(trans, &inode_iter, &inode, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); bch_err_msg(c, ret, "clearing inode unlinked flag"); if (ret) goto out; @@ -1199,7 +1189,7 @@ again: * flushed and we'd spin: */ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); if (ret > 0) { diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 056298050550..679f5f5e5d15 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -6,19 +6,20 @@ #include "bkey_methods.h" #include "opts.h" -enum bkey_invalid_flags; +enum bch_validate_flags; extern const char * const bch2_inode_opts[]; int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_inode ((struct bkey_ops) { \ .key_invalid = bch2_inode_invalid, \ @@ -49,7 +50,7 @@ static inline bool 
bkey_is_inode(const struct bkey *k) } int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ @@ -101,7 +102,7 @@ int bch2_inode_peek(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, subvol_inum, unsigned); int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, enum btree_update_flags); + struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags); static inline int bch2_inode_write(struct btree_trans *trans, struct btree_iter *iter, @@ -220,6 +221,14 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, int bch2_inode_nlink_inc(struct bch_inode_unpacked *); void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); +static inline bool bch2_inode_should_have_bp(struct bch_inode_unpacked *inode) +{ + bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; + + return S_ISDIR(inode->bi_mode) || + (!inode->bi_nlink && inode_has_bp); +} + struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 82f9170dab3f..4ec979b4b23e 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -198,7 +198,7 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inum.inum, start), - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta); @@ -230,7 +230,7 @@ static int truncate_set_isize(struct btree_trans *trans, struct bch_inode_unpacked inode_u; int ret; - ret = bch2_inode_peek(trans, &iter, &inode_u, inum, 
BTREE_ITER_INTENT) ?: + ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent) ?: (inode_u.bi_size = new_i_size, 0) ?: bch2_inode_write(trans, &iter, &inode_u); @@ -256,7 +256,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents, POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta); bch2_trans_iter_exit(trans, &fpunch_iter); @@ -317,7 +317,7 @@ static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset offset <<= 9; len <<= 9; - ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent); if (ret) return ret; @@ -365,7 +365,7 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inum.inum, 0), - BTREE_ITER_INTENT); + BTREE_ITER_intent); switch (op->v.state) { case LOGGED_OP_FINSERT_start: diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 8a556e6d1ab6..f57486794484 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -378,7 +378,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio bch2_bkey_buf_init(&sk); bch2_trans_iter_init(trans, &iter, rbio->data_btree, - rbio->read_pos, BTREE_ITER_SLOTS); + rbio->read_pos, BTREE_ITER_slots); retry: rbio->bio.bi_status = 0; @@ -487,7 +487,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, return 0; k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); if ((ret = bkey_err(k))) goto out; @@ -523,7 +523,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, goto out; ret = bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + 
BTREE_UPDATE_internal_snapshot_node); out: bch2_trans_iter_exit(trans, &iter); return ret; @@ -541,7 +541,6 @@ static void __bch2_read_endio(struct work_struct *work) struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); struct bio *src = &rbio->bio; struct bio *dst = &bch2_rbio_parent(rbio)->bio; struct bvec_iter dst_iter = rbio->bvec_iter; @@ -647,13 +646,15 @@ csum_err: prt_str(&buf, "data "); bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); - bch_err_inum_offset_ratelimited(ca, - rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "data %s", buf.buf); + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + if (ca) { + bch_err_inum_offset_ratelimited(ca, + rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "data %s", buf.buf); + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + } printbuf_exit(&buf); - - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; decompression_err: @@ -675,7 +676,7 @@ static void bch2_read_endio(struct bio *bio) struct bch_read_bio *rbio = container_of(bio, struct bch_read_bio, bio); struct bch_fs *c = rbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; struct workqueue_struct *wq = NULL; enum rbio_context context = RBIO_CONTEXT_NULL; @@ -687,17 +688,21 @@ static void bch2_read_endio(struct bio *bio) if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, - rbio->read_pos.inode, - rbio->read_pos.offset, - "data read error: %s", - bch2_blk_status_to_str(bio->bi_status))) { + if (bio->bi_status) { + if (ca) { + bch_err_inum_offset_ratelimited(ca, + rbio->read_pos.inode, + rbio->read_pos.offset, + "data read error: %s", + bch2_blk_status_to_str(bio->bi_status)); + bch2_io_error(ca, BCH_MEMBER_ERROR_read); + } bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); return; } if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(ca, &rbio->pick.ptr)) { + (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { trace_and_count(c, read_reuse_race, &rbio->bio); if (rbio->flags & BCH_READ_RETRY_IF_STALE) @@ -758,22 +763,21 @@ err: } static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + struct bch_dev *ca, struct bkey_s_c k, struct bch_extent_ptr ptr) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); struct btree_iter iter; struct printbuf buf = PRINTBUF; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - PTR_BUCKET_POS(c, &ptr), - BTREE_ITER_CACHED); + PTR_BUCKET_POS(ca, &ptr), + BTREE_ITER_cached); - prt_printf(&buf, "Attempting to read from stale dirty pointer:"); + prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); printbuf_indent_add(&buf, 2); - prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); @@ -801,7 +805,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; - struct bch_dev *ca = NULL; struct promote_op *promote = NULL; bool bounce = false, read_full 
= false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); @@ -832,7 +835,7 @@ retry_pick: goto err; } - ca = bch_dev_bkey_exists(c, pick.ptr.dev); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); /* * Stale dirty pointers are treated as IO errors, but @failed isn't @@ -842,9 +845,11 @@ retry_pick: */ if ((flags & BCH_READ_IN_RETRY) && !pick.ptr.cached && - unlikely(ptr_stale(ca, &pick.ptr))) { - read_from_stale_dirty_pointer(trans, k, pick.ptr); + ca && + unlikely(dev_ptr_stale(ca, &pick.ptr))) { + read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); bch2_mark_io_failure(failed, &pick); + percpu_ref_put(&ca->io_ref); goto retry_pick; } @@ -859,8 +864,11 @@ retry_pick: * can happen if we retry, and the extent we were going to read * has been merged in the meantime: */ - if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) + if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) { + if (ca) + percpu_ref_put(&ca->io_ref); goto hole; + } iter.bi_size = pick.crc.compressed_size << 9; goto get_bio; @@ -965,7 +973,7 @@ get_bio: rbio->bvec_iter = iter; rbio->offset_into_extent= offset_into_extent; rbio->flags = flags; - rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); + rbio->have_ioref = ca != NULL; rbio->narrow_crcs = narrow_crcs; rbio->hole = 0; rbio->retry = 0; @@ -995,7 +1003,7 @@ get_bio: * If it's being moved internally, we don't want to flag it as a cache * hit: */ - if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) + if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE)) bch2_bucket_io_time_reset(trans, pick.ptr.dev, PTR_BUCKET_NR(ca, &pick.ptr), READ); @@ -1113,7 +1121,7 @@ retry: bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum.inum, bvec_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS); + BTREE_ITER_slots); while (1) { unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c 
index 40d7df7607df..9401d13e31bb 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -166,7 +166,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, bch2_trans_copy_iter(&iter, extent_iter); for_each_btree_key_upto_continue_norestart(iter, - new->k.p, BTREE_ITER_SLOTS, old, ret) { + new->k.p, BTREE_ITER_slots, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), bkey_start_offset(old.k)); @@ -210,14 +210,14 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, * to be journalled - if we crash, the bi_journal_seq update will be * lost, but that's fine. */ - unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; + unsigned inode_update_flags = BTREE_UPDATE_nojournal; struct btree_iter iter; struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, SPOS(0, extent_iter->pos.inode, extent_iter->snapshot), - BTREE_ITER_CACHED); + BTREE_ITER_cached); int ret = bkey_err(k); if (unlikely(ret)) return ret; @@ -259,7 +259,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, } ret = bch2_trans_update(trans, &iter, &inode->k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_UPDATE_internal_snapshot_node| inode_update_flags); err: bch2_trans_iter_exit(trans, &iter); @@ -368,7 +368,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, bkey_start_pos(&sk.k->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?: bch2_extent_update(trans, inum, &iter, sk.k, @@ -407,13 +407,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, BUG_ON(c->opts.nochanges); bkey_for_each_ptr(ptrs, ptr) { - BUG_ON(!bch2_dev_exists2(c, ptr->dev)); - - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = nocow + ? 
bch2_dev_have_ref(c, ptr->dev) + : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE); if (to_entry(ptr + 1) < ptrs.end) { - n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, - GFP_NOFS, &ca->replica_set)); + n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set)); n->bio.bi_end_io = wbio->bio.bi_end_io; n->bio.bi_private = wbio->bio.bi_private; @@ -430,11 +429,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->c = c; n->dev = ptr->dev; - n->have_ioref = nocow || bch2_dev_get_ioref(ca, - type == BCH_DATA_btree ? READ : WRITE); + n->have_ioref = ca != NULL; n->nocow = nocow; n->submit_time = local_clock(); n->inode_offset = bkey_start_offset(&k->k); + if (nocow) + n->nocow_bucket = PTR_BUCKET_NR(ca, ptr); n->bio.bi_iter.bi_sector = ptr->offset; if (likely(n->have_ioref)) { @@ -481,7 +481,6 @@ static void bch2_write_done(struct closure *cl) static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) { struct keylist *keys = &op->insert_keys; - struct bch_extent_ptr *ptr; struct bkey_i *src, *dst = keys->keys, *n; for (src = keys->keys; src != keys->top; src = n) { @@ -650,7 +649,9 @@ static void bch2_write_endio(struct bio *bio) struct bch_write_bio *wbio = to_wbio(bio); struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; struct bch_fs *c = wbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + struct bch_dev *ca = wbio->have_ioref + ? 
bch2_dev_have_ref(c, wbio->dev) + : NULL; if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, op->pos.inode, @@ -661,8 +662,12 @@ static void bch2_write_endio(struct bio *bio) op->flags |= BCH_WRITE_IO_ERROR; } - if (wbio->nocow) + if (wbio->nocow) { + bch2_bucket_nocow_unlock(&c->nocow_locks, + POS(ca->dev_idx, wbio->nocow_bucket), + BUCKET_NOCOW_LOCK_UPDATE); set_bit(wbio->dev, op->devs_need_flush->d); + } if (wbio->have_ioref) { bch2_latency_acct(ca, wbio->submit_time, WRITE); @@ -1101,30 +1106,21 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op, return false; e = bkey_s_c_to_extent(k); + + rcu_read_lock(); extent_for_each_ptr_decode(e, p, entry) { - if (crc_is_encoded(p.crc) || p.has_ec) + if (crc_is_encoded(p.crc) || p.has_ec) { + rcu_read_unlock(); return false; + } replicas += bch2_extent_ptr_durability(c, &p); } + rcu_read_unlock(); return replicas >= op->opts.data_replicas; } -static inline void bch2_nocow_write_unlock(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - - for_each_keylist_key(&op->insert_keys, k) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); - - bkey_for_each_ptr(ptrs, ptr) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), - BUCKET_NOCOW_LOCK_UPDATE); - } -} - static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *orig, @@ -1158,7 +1154,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, return bch2_extent_update_i_size_sectors(trans, iter, min(new->k.p.offset << 9, new_i_size), 0) ?: bch2_trans_update(trans, iter, new, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); } static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) @@ -1169,7 +1165,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) for_each_keylist_key(&op->insert_keys, orig) { int ret = for_each_btree_key_upto_commit(trans, iter, 
BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, - BTREE_ITER_INTENT, k, + BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); })); @@ -1195,8 +1191,6 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) static void __bch2_nocow_write_done(struct bch_write_op *op) { - bch2_nocow_write_unlock(op); - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { op->error = -EIO; } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) @@ -1242,12 +1236,16 @@ retry: bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(op->pos.inode, op->pos.offset, snapshot), - BTREE_ITER_SLOTS); + BTREE_ITER_slots); while (1) { struct bio *bio = &op->wbio.bio; buckets.nr = 0; + ret = bch2_trans_relock(trans); + if (ret) + break; + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -1267,14 +1265,15 @@ retry: /* Get iorefs before dropping btree locks: */ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { - struct bpos b = PTR_BUCKET_POS(c, ptr); + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); + if (unlikely(!ca)) + goto err_get_ioref; + + struct bpos b = PTR_BUCKET_POS(ca, ptr); struct nocow_lock_bucket *l = bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b)); prefetch(l); - if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) - goto err_get_ioref; - /* XXX allocating memory with btree locks held - rare */ darray_push_gfp(&buckets, ((struct bucket_to_lock) { .b = b, .gen = ptr->gen, .l = l, @@ -1293,7 +1292,7 @@ retry: bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); darray_for_each(buckets, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->b.inode); + struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode); __bch2_bucket_nocow_lock(&c->nocow_locks, i->l, bucket_to_u64(i->b), @@ -1370,7 +1369,7 @@ err: return; err_get_ioref: 
darray_for_each(buckets, i) - percpu_ref_put(&bch_dev_bkey_exists(c, i->b.inode)->io_ref); + percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref); /* Fall back to COW path: */ goto out; @@ -1491,7 +1490,11 @@ err: if ((op->flags & BCH_WRITE_SYNC) || (!(op->flags & BCH_WRITE_DONE) && !(op->flags & BCH_WRITE_IN_WORKER))) { - closure_sync(&op->cl); + if (closure_sync_timeout(&op->cl, HZ * 10)) { + bch2_print_allocator_stuck(c); + closure_sync(&op->cl); + } + __bch2_write_index(op); if (!(op->flags & BCH_WRITE_DONE)) @@ -1649,8 +1652,7 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) prt_bitflags(out, bch2_write_flags, op->flags); prt_newline(out); - prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl)); - prt_newline(out); + prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl)); printbuf_indent_sub(out, 2); } @@ -1658,13 +1660,14 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) void bch2_fs_io_write_exit(struct bch_fs *c) { mempool_exit(&c->bio_bounce_pages); + bioset_exit(&c->replica_set); bioset_exit(&c->bio_write); } int bch2_fs_io_write_init(struct bch_fs *c) { - if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), - BIOSET_NEED_BVECS)) + if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) || + bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) return -BCH_ERR_ENOMEM_bio_write_init; if (mempool_init_page_pool(&c->bio_bounce_pages, diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h index c7f97c2c4805..6e878a6f2f0b 100644 --- a/fs/bcachefs/io_write_types.h +++ b/fs/bcachefs/io_write_types.h @@ -20,6 +20,7 @@ struct bch_write_bio { u64 submit_time; u64 inode_offset; + u64 nocow_bucket; struct bch_devs_list failed; u8 dev; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index a8b08e76d0d0..adec8e1ea73e 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -53,29 +53,19 @@ 
static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 unsigned i = seq & JOURNAL_BUF_MASK; struct journal_buf *buf = j->buf + i; - prt_str(out, "seq:"); - prt_tab(out); - prt_printf(out, "%llu", seq); - prt_newline(out); + prt_printf(out, "seq:\t%llu\n", seq); printbuf_indent_add(out, 2); - prt_str(out, "refcount:"); - prt_tab(out); - prt_printf(out, "%u", journal_state_count(s, i)); - prt_newline(out); + prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i)); - prt_str(out, "size:"); - prt_tab(out); + prt_printf(out, "size:\t"); prt_human_readable_u64(out, vstruct_bytes(buf->data)); prt_newline(out); - prt_str(out, "expires:"); - prt_tab(out); - prt_printf(out, "%li jiffies", buf->expires - jiffies); - prt_newline(out); + prt_printf(out, "expires:\t"); + prt_printf(out, "%li jiffies\n", buf->expires - jiffies); - prt_str(out, "flags:"); - prt_tab(out); + prt_printf(out, "flags:\t"); if (buf->noflush) prt_str(out, "noflush "); if (buf->must_flush) @@ -87,9 +77,9 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 if (buf->write_started) prt_str(out, "write_started "); if (buf->write_allocated) - prt_str(out, "write allocated "); + prt_str(out, "write_allocated "); if (buf->write_done) - prt_str(out, "write done"); + prt_str(out, "write_done"); prt_newline(out); printbuf_indent_sub(out, 2); @@ -948,7 +938,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, break; } } else { - ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl); + ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, + BCH_DATA_journal, cl); ret = PTR_ERR_OR_ZERO(ob[nr_got]); if (ret) break; @@ -956,7 +947,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ret = bch2_trans_run(c, bch2_trans_mark_metadata_bucket(trans, ca, ob[nr_got]->bucket, BCH_DATA_journal, - ca->mi.bucket_size)); + ca->mi.bucket_size, BTREE_TRIGGER_transactional)); if (ret) { 
bch2_open_bucket_put(c, ob[nr_got]); bch_err_msg(c, ret, "marking new journal buckets"); @@ -1036,7 +1027,8 @@ err_unblock: for (i = 0; i < nr_got; i++) bch2_trans_run(c, bch2_trans_mark_metadata_bucket(trans, ca, - bu[i], BCH_DATA_free, 0)); + bu[i], BCH_DATA_free, 0, + BTREE_TRIGGER_transactional)); err_free: if (!new_fs) for (i = 0; i < nr_got; i++) @@ -1187,12 +1179,14 @@ void bch2_fs_journal_stop(struct journal *j) bch2_journal_meta(j); journal_quiesce(j); + cancel_delayed_work_sync(&j->write_work); BUG_ON(!bch2_journal_error(j) && - test_bit(JOURNAL_REPLAY_DONE, &j->flags) && + test_bit(JOURNAL_replay_done, &j->flags) && j->last_empty_seq != journal_cur_seq(j)); - cancel_delayed_work_sync(&j->write_work); + if (!bch2_journal_error(j)) + clear_bit(JOURNAL_running, &j->flags); } int bch2_fs_journal_start(struct journal *j, u64 cur_seq) @@ -1266,7 +1260,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) spin_lock(&j->lock); - set_bit(JOURNAL_STARTED, &j->flags); + set_bit(JOURNAL_running, &j->flags); j->last_flush_write = jiffies; j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); @@ -1407,6 +1401,13 @@ int bch2_fs_journal_init(struct journal *j) /* debug: */ +static const char * const bch2_journal_flags_strs[] = { +#define x(n) #n, + JOURNAL_FLAGS() +#undef x + NULL +}; + void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); @@ -1415,19 +1416,22 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes; if (!out->nr_tabstops) - printbuf_tabstop_push(out, 24); + printbuf_tabstop_push(out, 28); out->atomic++; rcu_read_lock(); s = READ_ONCE(j->reservations); + prt_printf(out, "flags:\t"); + prt_bitflags(out, bch2_journal_flags_strs, j->flags); + prt_newline(out); prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); - prt_printf(out, 
"seq:\t\t\t%llu\n", journal_cur_seq(j)); - prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); - prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); + prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j)); + prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk); + prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j)); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); + prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]); prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); @@ -1436,48 +1440,44 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_newline(out); prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); - prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); + prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked); prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); - prt_printf(out, "blocked:\t\t%u\n", j->blocked); + prt_printf(out, "blocked:\t%u\n", j->blocked); prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); - prt_printf(out, "current entry:\t\t"); + prt_printf(out, "current entry:\t"); switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: - prt_printf(out, "error"); + prt_printf(out, "error\n"); break; case JOURNAL_ENTRY_CLOSED_VAL: - prt_printf(out, "closed"); + prt_printf(out, "closed\n"); break; default: - prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s); + prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); break; } - prt_newline(out); - prt_printf(out, "unwritten entries:"); - prt_newline(out); + prt_printf(out, "unwritten entries:\n"); bch2_journal_bufs_to_text(out, j); - prt_printf(out, - "replay done:\t\t%i\n", - test_bit(JOURNAL_REPLAY_DONE, &j->flags)); - prt_printf(out, "space:\n"); - prt_printf(out, "\tdiscarded\t%u:%u\n", + printbuf_indent_add(out, 2); + prt_printf(out, "discarded\t%u:%u\n", j->space[journal_space_discarded].next_entry, j->space[journal_space_discarded].total); - prt_printf(out, "\tclean ondisk\t%u:%u\n", + prt_printf(out, "clean ondisk\t%u:%u\n", j->space[journal_space_clean_ondisk].next_entry, j->space[journal_space_clean_ondisk].total); - prt_printf(out, "\tclean\t\t%u:%u\n", + prt_printf(out, "clean\t%u:%u\n", j->space[journal_space_clean].next_entry, j->space[journal_space_clean].total); - prt_printf(out, "\ttotal\t\t%u:%u\n", + prt_printf(out, "total\t%u:%u\n", j->space[journal_space_total].next_entry, j->space[journal_space_total].total); + printbuf_indent_sub(out, 2); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; @@ -1488,14 +1488,16 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) if (!ja->nr) 
continue; - prt_printf(out, "dev %u:\n", ca->dev_idx); - prt_printf(out, "\tnr\t\t%u\n", ja->nr); - prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); - prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); - prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); - prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); - prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); - prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + prt_printf(out, "dev %u:\n", ca->dev_idx); + printbuf_indent_add(out, 2); + prt_printf(out, "nr\t%u\n", ja->nr); + prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size); + prt_printf(out, "available\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); + prt_printf(out, "discard_idx\t%u\n", ja->discard_idx); + prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n",ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); + prt_printf(out, "dirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); + prt_printf(out, "cur_idx\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + printbuf_indent_sub(out, 2); } rcu_read_unlock(); @@ -1527,25 +1529,18 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 pin_list = journal_seq_pin(j, *seq); - prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count)); - prt_newline(out); + prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); printbuf_indent_add(out, 2); for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++) - list_for_each_entry(pin, &pin_list->list[i], list) { - prt_printf(out, "\t%px %ps", pin, pin->flush); - prt_newline(out); - } + list_for_each_entry(pin, &pin_list->list[i], list) + prt_printf(out, "\t%px %ps\n", pin, pin->flush); - if 
(!list_empty(&pin_list->flushed)) { - prt_printf(out, "flushed:"); - prt_newline(out); - } + if (!list_empty(&pin_list->flushed)) + prt_printf(out, "flushed:\n"); - list_for_each_entry(pin, &pin_list->flushed, list) { - prt_printf(out, "\t%px %ps", pin, pin->flush); - prt_newline(out); - } + list_for_each_entry(pin, &pin_list->flushed, list) + prt_printf(out, "\t%px %ps\n", pin, pin->flush); printbuf_indent_sub(out, 2); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 7c7528f839c5..fd1f7cdaa8bc 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -372,7 +372,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re int ret; EBUG_ON(res->ref); - EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); + EBUG_ON(!test_bit(JOURNAL_running, &j->flags)); res->u64s = u64s; @@ -418,8 +418,8 @@ struct bch_dev; static inline void bch2_journal_set_replay_done(struct journal *j) { - BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); - set_bit(JOURNAL_REPLAY_DONE, &j->flags); + BUG_ON(!test_bit(JOURNAL_running, &j->flags)); + set_bit(JOURNAL_replay_done, &j->flags); } void bch2_journal_unblock(struct journal *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index eb1f9d6f5a19..cdcb1ad49af4 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -17,15 +17,38 @@ #include "sb-clean.h" #include "trace.h" +void bch2_journal_pos_from_member_info_set(struct bch_fs *c) +{ + lockdep_assert_held(&c->sb_lock); + + for_each_member_device(c, ca) { + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + + m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx); + m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free); + } +} + +void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) +{ + mutex_lock(&c->sb_lock); + for_each_member_device(c, ca) { + struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); + + unsigned idx = 
le32_to_cpu(m.last_journal_bucket); + if (idx < ca->journal.nr) + ca->journal.cur_idx = idx; + unsigned offset = le32_to_cpu(m.last_journal_bucket_offset); + if (offset <= ca->mi.bucket_size) + ca->journal.sectors_free = ca->mi.bucket_size - offset; + } + mutex_unlock(&c->sb_lock); +} + void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j) { darray_for_each(j->ptrs, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); - u64 offset; - - div64_u64_rem(i->sector, ca->mi.bucket_size, &offset); - if (i != j->ptrs.data) prt_printf(out, " "); prt_printf(out, "%u:%u:%u (sector %llu)", @@ -122,6 +145,10 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, struct printbuf buf = PRINTBUF; int ret = JOURNAL_ENTRY_ADD_OK; + if (!c->journal.oldest_seq_found_ondisk || + le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) + c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); + /* Is this entry older than the range we need? */ if (!c->opts.read_entire_journal && le64_to_cpu(j->seq) < jlist->last_seq) @@ -272,7 +299,7 @@ static void journal_entry_err_msg(struct printbuf *out, journal_entry_err_msg(&_buf, version, jset, entry); \ prt_printf(&_buf, msg, ##__VA_ARGS__); \ \ - switch (flags & BKEY_INVALID_WRITE) { \ + switch (flags & BCH_VALIDATE_write) { \ case READ: \ mustfix_fsck_err(c, _err, "%s", _buf.buf); \ break; \ @@ -301,9 +328,9 @@ static int journal_validate_key(struct bch_fs *c, unsigned level, enum btree_id btree_id, struct bkey_i *k, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { - int write = flags & BKEY_INVALID_WRITE; + int write = flags & BCH_VALIDATE_write; void *next = vstruct_next(entry); struct printbuf buf = PRINTBUF; int ret = 0; @@ -376,7 +403,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum 
bch_validate_flags flags) { struct bkey_i *k = entry->start; @@ -385,7 +412,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, entry->level, entry->btree_id, k, version, big_endian, - flags|BKEY_INVALID_JOURNAL); + flags|BCH_VALIDATE_journal); if (ret == FSCK_DELETED_KEY) continue; @@ -416,7 +443,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct bkey_i *k = entry->start; int ret = 0; @@ -455,7 +482,7 @@ static int journal_entry_prio_ptrs_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { /* obsolete, don't care: */ return 0; @@ -470,7 +497,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { int ret = 0; @@ -497,7 +524,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct jset_entry_blacklist_v2 *bl_entry; int ret = 0; @@ -539,7 +566,7 @@ static int journal_entry_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); @@ -573,7 +600,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, 
entry); @@ -617,7 +644,7 @@ static int journal_entry_clock_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); @@ -657,13 +684,12 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); unsigned expected = sizeof(*u); - unsigned dev; int ret = 0; if (journal_entry_err_on(bytes < expected, @@ -675,16 +701,6 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, return ret; } - dev = le32_to_cpu(u->dev); - - if (journal_entry_err_on(!bch2_dev_exists2(c, dev), - c, version, jset, entry, - journal_entry_dev_usage_bad_dev, - "bad dev")) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - if (journal_entry_err_on(u->pad, c, version, jset, entry, journal_entry_dev_usage_bad_pad, @@ -719,7 +735,7 @@ static int journal_entry_log_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { return 0; } @@ -737,7 +753,7 @@ static int journal_entry_overwrite_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, READ); @@ -753,7 +769,7 @@ static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum 
bch_validate_flags flags) { return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, READ); @@ -769,7 +785,7 @@ static int journal_entry_datetime_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { unsigned bytes = vstruct_bytes(entry); unsigned expected = 16; @@ -799,7 +815,7 @@ static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs * struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, - enum bkey_invalid_flags); + enum bch_validate_flags); void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); }; @@ -817,7 +833,7 @@ int bch2_journal_entry_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { return entry->type < BCH_JSET_ENTRY_NR ? 
bch2_jset_entry_ops[entry->type].validate(c, jset, entry, @@ -837,7 +853,7 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, } static int jset_validate_entries(struct bch_fs *c, struct jset *jset, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { unsigned version = le32_to_cpu(jset->version); int ret = 0; @@ -863,7 +879,7 @@ fsck_err: static int jset_validate(struct bch_fs *c, struct bch_dev *ca, struct jset *jset, u64 sector, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { unsigned version; int ret = 0; @@ -918,7 +934,7 @@ static int jset_validate_early(struct bch_fs *c, { size_t bytes = vstruct_bytes(jset); unsigned version; - enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; + enum bch_validate_flags flags = BCH_VALIDATE_journal; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) @@ -1057,6 +1073,13 @@ reread: goto err; } + if (le64_to_cpu(j->seq) > ja->highest_seq_found) { + ja->highest_seq_found = le64_to_cpu(j->seq); + ja->cur_idx = bucket; + ja->sectors_free = ca->mi.bucket_size - + bucket_remainder(ca, offset) - sectors; + } + /* * This happens sometimes if we don't have discards on - * when we've partially overwritten a bucket with new @@ -1125,8 +1148,6 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) struct bch_fs *c = ca->fs; struct journal_list *jlist = container_of(cl->parent, struct journal_list, cl); - struct journal_replay *r, **_r; - struct genradix_iter iter; struct journal_read_buf buf = { NULL, 0 }; unsigned i; int ret = 0; @@ -1146,47 +1167,6 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) goto err; } - ja->sectors_free = ca->mi.bucket_size; - - mutex_lock(&jlist->lock); - genradix_for_each_reverse(&c->journal_entries, iter, _r) { - r = *_r; - - if (!r) - continue; - - darray_for_each(r->ptrs, i) - if (i->dev == ca->dev_idx) { - unsigned wrote = bucket_remainder(ca, i->sector) + - vstruct_sectors(&r->j, c->block_bits); - - ja->cur_idx = i->bucket; - 
ja->sectors_free = ca->mi.bucket_size - wrote; - goto found; - } - } -found: - mutex_unlock(&jlist->lock); - - if (ja->bucket_seq[ja->cur_idx] && - ja->sectors_free == ca->mi.bucket_size) { -#if 0 - /* - * Debug code for ZNS support, where we (probably) want to be - * correlated where we stopped in the journal to the zone write - * points: - */ - bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); - bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); - for (i = 0; i < 3; i++) { - unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; - - bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); - } -#endif - ja->sectors_free = 0; - } - /* * Set dirty_idx to indicate the entire journal is full and needs to be * reclaimed - journal reclaim will immediately reclaim whatever isn't @@ -1255,7 +1235,7 @@ int bch2_journal_read(struct bch_fs *c, * those entries will be blacklisted: */ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { - enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; + enum bch_validate_flags flags = BCH_VALIDATE_journal; i = *_i; @@ -1366,7 +1346,7 @@ int bch2_journal_read(struct bch_fs *c, fsck_err(c, journal_entries_missing, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n" " prev at %s\n" - " next at %s", + " next at %s, continue?", missing_start, missing_end, *last_seq, *blacklist_seq - 1, buf1.buf, buf2.buf); @@ -1390,7 +1370,7 @@ int bch2_journal_read(struct bch_fs *c, continue; darray_for_each(i->ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); if (!ptr->csum_good) bch_err_dev_offset(ca, ptr->sector, @@ -1400,7 +1380,7 @@ int bch2_journal_read(struct bch_fs *c, } ret = jset_validate(c, - bch_dev_bkey_exists(c, i->ptrs.data[0].dev), + bch2_dev_have_ref(c, i->ptrs.data[0].dev), &i->j, i->ptrs.data[0].sector, READ); @@ -1731,10 +1711,8 @@ static CLOSURE_CALLBACK(journal_write_submit) unsigned sectors = vstruct_sectors(w->data, c->block_bits); extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct journal_device *ja = &ca->journal; - - if (!percpu_ref_tryget(&ca->io_ref)) { + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); + if (!ca) { /* XXX: fix this */ bch_err(c, "missing device for journal write\n"); continue; @@ -1743,6 +1721,7 @@ static CLOSURE_CALLBACK(journal_write_submit) this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); + struct journal_device *ja = &ca->journal; struct bio *bio = &ja->bio[w->idx]->bio; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; @@ -1958,14 +1937,14 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * * So if we're in an error state, and we're still starting up, we don't * write anything at all. 
*/ - if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags)) + if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) return -EIO; if (error || w->noflush || (!w->must_flush && (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && - test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { + test_bit(JOURNAL_may_skip_flush, &j->flags))) { w->noflush = true; SET_JSET_NO_FLUSH(w->data, true); w->data->last_seq = 0; @@ -1976,7 +1955,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * w->must_flush = true; j->last_flush_write = jiffies; j->nr_flush_writes++; - clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); + clear_bit(JOURNAL_need_flush_write, &j->flags); } return 0; diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 4f1e763ab506..2ca9cde30ea8 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -4,6 +4,9 @@ #include "darray.h" +void bch2_journal_pos_from_member_info_set(struct bch_fs *); +void bch2_journal_pos_from_member_info_resume(struct bch_fs *); + struct journal_ptr { bool csum_good; u8 dev; @@ -60,7 +63,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, int bch2_journal_entry_validate(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, - enum bkey_invalid_flags); + enum bch_validate_flags); void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, struct jset_entry *); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 04a577848b01..79be0eaddfa0 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -67,7 +67,7 @@ void bch2_journal_set_watermark(struct journal *j) track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb)) trace_and_count(c, journal_full, c); - mod_bit(JOURNAL_SPACE_LOW, &j->flags, low_on_space || low_on_pin); + mod_bit(JOURNAL_space_low, &j->flags, low_on_space || low_on_pin); swap(watermark, j->watermark); if 
(watermark > j->watermark) @@ -225,9 +225,9 @@ void bch2_journal_space_available(struct journal *j) j->space[journal_space_clean_ondisk].total) && (clean - clean_ondisk <= total / 8) && (clean_ondisk * 2 > clean)) - set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + set_bit(JOURNAL_may_skip_flush, &j->flags); else - clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + clear_bit(JOURNAL_may_skip_flush, &j->flags); bch2_journal_set_watermark(j); out: @@ -818,7 +818,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, * If journal replay hasn't completed, the unreplayed journal entries * hold refs on their corresponding sequence numbers */ - ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || + ret = !test_bit(JOURNAL_replay_done, &j->flags) || journal_last_seq(j) > seq_to_flush || !fifo_used(&j->pin); @@ -833,7 +833,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) /* time_stats this */ bool did_work = false; - if (!test_bit(JOURNAL_STARTED, &j->flags)) + if (!test_bit(JOURNAL_running, &j->flags)) return false; closure_wait_event(&j->async_wait, diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index ae4fb8c3a2bc..db80e506e3ab 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -16,9 +16,8 @@ static int u64_cmp(const void *_l, const void *_r) return cmp_int(*l, *r); } -static int bch2_sb_journal_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_journal *journal = field_to_type(f, journal); struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); @@ -99,9 +98,8 @@ static int u64_range_cmp(const void *_l, const void *_r) return cmp_int(l->start, r->start); } -static int bch2_sb_journal_v2_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_journal_v2_validate(struct bch_sb *sb, 
struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 37a024e034d4..ed4846709611 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "btree_iter.h" #include "eytzinger.h" +#include "journal.h" #include "journal_seq_blacklist.h" #include "super-io.h" @@ -162,9 +162,8 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) return 0; } -static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_journal_seq_blacklist *bl = field_to_type(f, journal_seq_blacklist); @@ -217,78 +216,40 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { .to_text = bch2_sb_journal_seq_blacklist_to_text }; -void bch2_blacklist_entries_gc(struct work_struct *work) +bool bch2_blacklist_entries_gc(struct bch_fs *c) { - struct bch_fs *c = container_of(work, struct bch_fs, - journal_seq_blacklist_gc_work); - struct journal_seq_blacklist_table *t; - struct bch_sb_field_journal_seq_blacklist *bl; struct journal_seq_blacklist_entry *src, *dst; - struct btree_trans *trans = bch2_trans_get(c); - unsigned i, nr, new_nr; - int ret; - - for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_iter iter; - struct btree *b; - - bch2_trans_node_iter_init(trans, &iter, i, POS_MIN, - 0, 0, BTREE_ITER_PREFETCH); -retry: - bch2_trans_begin(trans); - - b = bch2_btree_iter_peek_node(&iter); - - while (!(ret = PTR_ERR_OR_ZERO(b)) && - b && - !test_bit(BCH_FS_stopping, &c->flags)) - b = 
bch2_btree_iter_next_node(&iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - bch2_trans_iter_exit(trans, &iter); - } - - bch2_trans_put(trans); - if (ret) - return; - - mutex_lock(&c->sb_lock); - bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); + struct bch_sb_field_journal_seq_blacklist *bl = + bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); if (!bl) - goto out; + return false; - nr = blacklist_nr_entries(bl); + unsigned nr = blacklist_nr_entries(bl); dst = bl->start; - t = c->journal_seq_blacklist_table; + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; BUG_ON(nr != t->nr); + unsigned i; for (src = bl->start, i = eytzinger0_first(t->nr); src < bl->start + nr; src++, i = eytzinger0_next(i, nr)) { BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); - if (t->entries[i].dirty) + if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk) *dst++ = *src; } - new_nr = dst - bl->start; - - bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); - - if (new_nr != nr) { - bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, - new_nr ? sb_blacklist_u64s(new_nr) : 0); - BUG_ON(new_nr && !bl); + unsigned new_nr = dst - bl->start; + if (new_nr == nr) + return false; - if (!new_nr) - c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); + bch_verbose(c, "nr blacklist entries was %u, now %u", nr, new_nr); - bch2_write_super(c); - } -out: - mutex_unlock(&c->sb_lock); + bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, + new_nr ? 
sb_blacklist_u64s(new_nr) : 0); + BUG_ON(new_nr && !bl); + return true; } diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index afb886ec8e25..d47636f96fdc 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -17,6 +17,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; -void bch2_blacklist_entries_gc(struct work_struct *); +bool bch2_blacklist_entries_gc(struct bch_fs *); #endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index b5161b5d76a0..19183fcf7ad7 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -129,12 +129,17 @@ enum journal_space_from { journal_space_nr, }; +#define JOURNAL_FLAGS() \ + x(replay_done) \ + x(running) \ + x(may_skip_flush) \ + x(need_flush_write) \ + x(space_low) + enum journal_flags { - JOURNAL_REPLAY_DONE, - JOURNAL_STARTED, - JOURNAL_MAY_SKIP_FLUSH, - JOURNAL_NEED_FLUSH_WRITE, - JOURNAL_SPACE_LOW, +#define x(n) JOURNAL_##n, + JOURNAL_FLAGS() +#undef x }; /* Reasons we may fail to get a journal reservation: */ @@ -229,6 +234,7 @@ struct journal { u64 last_seq_ondisk; u64 err_seq; u64 last_empty_seq; + u64 oldest_seq_found_ondisk; /* * FIFO of journal entries whose btree updates have not yet been @@ -326,6 +332,7 @@ struct journal_device { /* for bch_journal_read_device */ struct closure read; + u64 highest_seq_found; }; /* diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index b82f8209041f..f49fdca1d07d 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -56,7 +56,7 @@ int bch2_resume_logged_ops(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_logged_ops, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, resume_logged_op(trans, &iter, k))); bch_err_fn(c, ret); return ret; diff --git 
a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 26569043e368..a40d116224ed 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -11,7 +11,7 @@ /* KEY_TYPE_lru is obsolete: */ int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -149,7 +149,7 @@ int bch2_check_lrus(struct bch_fs *c) struct bpos last_flushed_pos = POS_MIN; int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, - BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, + BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, bch2_check_lru_key(trans, &iter, k, &last_flushed_pos))); bch_err_fn(c, ret); diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 429dca816df5..fb11ab0dd00e 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -49,7 +49,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) } int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_lru_pos_to_text(struct printbuf *, struct bpos); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 69098eeb5d48..ddc187fb693d 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -49,7 +49,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, if (!bch2_bkey_has_device_c(k, dev_idx)) return 0; - n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node); ret = PTR_ERR_OR_ZERO(n); if (ret) return ret; @@ -67,7 +67,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, /* * Since we're not inserting through an extent iterator - * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * (BTREE_ITER_all_snapshots iterators aren't 
extent iterators), * we aren't using the extent overwrite path to delete, we're * just using the normal key deletion path: */ @@ -87,7 +87,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) continue; ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); if (ret) @@ -119,7 +119,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) for (id = 0; id < BTREE_ID_NR; id++) { bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); retry: ret = 0; while (bch2_trans_begin(trans), diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 4d94b7742dbb..8171f947fac8 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -41,28 +41,23 @@ static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c struct data_update_opts *data_opts) { printbuf_tabstop_push(out, 20); - prt_str(out, "rewrite ptrs:"); - prt_tab(out); + prt_str(out, "rewrite ptrs:\t"); bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); prt_newline(out); - prt_str(out, "kill ptrs: "); - prt_tab(out); + prt_str(out, "kill ptrs:\t"); bch2_prt_u64_base2(out, data_opts->kill_ptrs); prt_newline(out); - prt_str(out, "target: "); - prt_tab(out); + prt_str(out, "target:\t"); bch2_target_to_text(out, c, data_opts->target); prt_newline(out); - prt_str(out, "compression: "); - prt_tab(out); + prt_str(out, "compression:\t"); bch2_compression_opt_to_text(out, background_compression(*io_opts)); prt_newline(out); - prt_str(out, "extra replicas: "); - prt_tab(out); + prt_str(out, "extra replicas:\t"); prt_u64(out, data_opts->extra_replicas); } @@ -421,7 +416,7 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, io_opts->d.nr = 0; ret = for_each_btree_key(trans, iter, 
BTREE_ID_inodes, POS(0, extent_k.k->p.inode), - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_all_snapshots, k, ({ if (k.k->p.offset != extent_k.k->p.inode) break; @@ -467,7 +462,7 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans, k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), - BTREE_ITER_CACHED); + BTREE_ITER_cached); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; @@ -553,8 +548,8 @@ static int bch2_move_data_btree(struct moving_context *ctxt, } bch2_trans_iter_init(trans, &iter, btree_id, start, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots); if (ctxt->rate) bch2_ratelimit_reset(ctxt->rate); @@ -695,6 +690,10 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bpos bp_pos = POS_MIN; int ret = 0; + struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); + if (!ca) + return 0; + trace_bucket_evacuate(c, &bucket); bch2_bkey_buf_init(&sk); @@ -705,7 +704,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_trans_begin(trans); bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - bucket, BTREE_ITER_CACHED); + bucket, BTREE_ITER_cached); ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); bch2_trans_iter_exit(trans, &iter); @@ -716,7 +715,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, a = bch2_alloc_to_v4(k, &a_convert); dirty_sectors = bch2_bucket_sectors_dirty(*a); - bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size; + bucket_size = ca->mi.bucket_size; fragmentation = a->fragmentation_lru; ret = bch2_btree_write_buffer_tryflush(trans); @@ -730,9 +729,9 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_trans_begin(trans); - ret = bch2_get_next_backpointer(trans, bucket, gen, + ret = bch2_get_next_backpointer(trans, ca, bucket, gen, &bp_pos, &bp, - BTREE_ITER_CACHED); + BTREE_ITER_cached); if (bch2_err_matches(ret, 
BCH_ERR_transaction_restart)) continue; if (ret) @@ -828,6 +827,7 @@ next: trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); err: + bch2_dev_put(ca); bch2_bkey_buf_exit(&sk, c); return ret; } @@ -868,7 +868,7 @@ static int bch2_move_btree(struct bch_fs *c, continue; bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); retry: ret = 0; while (bch2_trans_begin(trans), @@ -975,26 +975,10 @@ static bool migrate_btree_pred(struct bch_fs *c, void *arg, */ static bool bformat_needs_redo(struct bkey_format *f) { - for (unsigned i = 0; i < f->nr_fields; i++) { - unsigned f_bits = f->bits_per_field[i]; - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); - u64 field_offset = le64_to_cpu(f->field_offset[i]); - - if (f_bits > unpacked_bits) - return true; - - if ((f_bits == unpacked_bits) && field_offset) + for (unsigned i = 0; i < f->nr_fields; i++) + if (bch2_bkey_format_field_overflows(f, i)) return true; - u64 f_mask = f_bits - ? 
~((~0ULL << (f_bits - 1)) << 1) - : 0; - - if (((field_offset + f_mask) & unpacked_mask) < field_offset) - return true; - } - return false; } @@ -1049,6 +1033,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, struct extent_ptr_decoded p; unsigned i = 0; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { unsigned d = bch2_extent_ptr_durability(c, &p); @@ -1059,6 +1044,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, i++; } + rcu_read_unlock(); return data_opts->kill_ptrs != 0; } @@ -1143,23 +1129,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) prt_newline(out); printbuf_indent_add(out, 2); - prt_str(out, "keys moved: "); - prt_u64(out, atomic64_read(&stats->keys_moved)); - prt_newline(out); - - prt_str(out, "keys raced: "); - prt_u64(out, atomic64_read(&stats->keys_raced)); - prt_newline(out); - - prt_str(out, "bytes seen: "); + prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved)); + prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced)); + prt_printf(out, "bytes seen: "); prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); prt_newline(out); - prt_str(out, "bytes moved: "); + prt_printf(out, "bytes moved: "); prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); prt_newline(out); - prt_str(out, "bytes raced: "); + prt_printf(out, "bytes raced: "); prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); prt_newline(out); @@ -1173,19 +1153,17 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str bch2_move_stats_to_text(out, ctxt->stats); printbuf_indent_add(out, 2); - prt_printf(out, "reads: ios %u/%u sectors %u/%u", + prt_printf(out, "reads: ios %u/%u sectors %u/%u\n", atomic_read(&ctxt->read_ios), c->opts.move_ios_in_flight, atomic_read(&ctxt->read_sectors), c->opts.move_bytes_in_flight >> 9); - prt_newline(out); - prt_printf(out, 
"writes: ios %u/%u sectors %u/%u", + prt_printf(out, "writes: ios %u/%u sectors %u/%u\n", atomic_read(&ctxt->write_ios), c->opts.move_ios_in_flight, atomic_read(&ctxt->write_sectors), c->opts.move_bytes_in_flight >> 9); - prt_newline(out); printbuf_indent_add(out, 2); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 0d2b82d8d11f..10bfb31c151b 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -84,7 +84,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, return 0; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, - b->k.bucket, BTREE_ITER_CACHED); + b->k.bucket, BTREE_ITER_cached); ret = bkey_err(k); if (ret) return ret; @@ -158,6 +158,8 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) return ret; + bch2_trans_begin(trans); + ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 84e452835a17..25530e0bb2f3 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -426,11 +426,6 @@ enum fsck_err_opts { BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ NULL, "Set superblock to latest version,\n" \ "allowing any new features to be used") \ - x(buckets_nouse, u8, \ - 0, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Allocate the buckets_nouse bitmap") \ x(stdio, u64, \ 0, \ OPT_UINT(0, S64_MAX), \ @@ -480,7 +475,7 @@ enum fsck_err_opts { OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ - NULL, "BTREE_ITER_PREFETCH casuse btree nodes to be\n"\ + NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\ " prefetched sequentially") struct bch_opts { diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c index b27d22925929..8b0369185f5c 100644 --- a/fs/bcachefs/printbuf.c +++ b/fs/bcachefs/printbuf.c @@ 
-10,35 +10,50 @@ #include "printbuf.h" -static inline unsigned printbuf_linelen(struct printbuf *buf) +static inline unsigned __printbuf_linelen(struct printbuf *buf, unsigned pos) { - return buf->pos - buf->last_newline; + return pos - buf->last_newline; } -int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) +static inline unsigned printbuf_linelen(struct printbuf *buf) { - unsigned new_size; - char *buf; + return __printbuf_linelen(buf, buf->pos); +} - if (!out->heap_allocated) - return 0; +/* + * Returns spaces from start of line, if set, or 0 if unset: + */ +static inline unsigned cur_tabstop(struct printbuf *buf) +{ + return buf->cur_tabstop < buf->nr_tabstops + ? buf->_tabstops[buf->cur_tabstop] + : 0; +} +int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) +{ /* Reserved space for terminating nul: */ extra += 1; - if (out->pos + extra < out->size) + if (out->pos + extra <= out->size) return 0; - new_size = roundup_pow_of_two(out->size + extra); + if (!out->heap_allocated) { + out->overflow = true; + return 0; + } + + unsigned new_size = roundup_pow_of_two(out->size + extra); /* * Note: output buffer must be freeable with kfree(), it's not required * that the user use printbuf_exit(). */ - buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT); + char *buf = krealloc(out->buf, new_size, !out->atomic ? 
GFP_KERNEL : GFP_NOWAIT); if (!buf) { out->allocation_failure = true; + out->overflow = true; return -ENOMEM; } @@ -47,6 +62,92 @@ int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) return 0; } +static void printbuf_advance_pos(struct printbuf *out, unsigned len) +{ + out->pos += min(len, printbuf_remaining(out)); +} + +static void printbuf_insert_spaces(struct printbuf *out, unsigned pos, unsigned nr) +{ + unsigned move = out->pos - pos; + + bch2_printbuf_make_room(out, nr); + + if (pos + nr < out->size) + memmove(out->buf + pos + nr, + out->buf + pos, + min(move, out->size - 1 - pos - nr)); + + if (pos < out->size) + memset(out->buf + pos, ' ', min(nr, out->size - pos)); + + printbuf_advance_pos(out, nr); + printbuf_nul_terminate_reserved(out); +} + +static void __printbuf_do_indent(struct printbuf *out, unsigned pos) +{ + while (true) { + int pad; + unsigned len = out->pos - pos; + char *p = out->buf + pos; + char *n = memscan(p, '\n', len); + if (cur_tabstop(out)) { + n = min(n, (char *) memscan(p, '\r', len)); + n = min(n, (char *) memscan(p, '\t', len)); + } + + pos = n - out->buf; + if (pos == out->pos) + break; + + switch (*n) { + case '\n': + pos++; + out->last_newline = pos; + + printbuf_insert_spaces(out, pos, out->indent); + + pos = min(pos + out->indent, out->pos); + out->last_field = pos; + out->cur_tabstop = 0; + break; + case '\r': + memmove(n, n + 1, out->pos - pos); + --out->pos; + pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos); + if (pad > 0) { + printbuf_insert_spaces(out, out->last_field, pad); + pos += pad; + } + + out->last_field = pos; + out->cur_tabstop++; + break; + case '\t': + pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos) - 1; + if (pad > 0) { + *n = ' '; + printbuf_insert_spaces(out, pos, pad - 1); + pos += pad; + } else { + memmove(n, n + 1, out->pos - pos); + --out->pos; + } + + out->last_field = pos; + out->cur_tabstop++; + break; + } + } +} + +static inline void 
printbuf_do_indent(struct printbuf *out, unsigned pos) +{ + if (out->has_indent_or_tabstops && !out->suppress_indent_tabstop_handling) + __printbuf_do_indent(out, pos); +} + void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) { int len; @@ -55,14 +156,14 @@ void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) va_list args2; va_copy(args2, args); - len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2); + len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args2); va_end(args2); - } while (len + 1 >= printbuf_remaining(out) && - !bch2_printbuf_make_room(out, len + 1)); + } while (len > printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len)); - len = min_t(size_t, len, - printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); - out->pos += len; + unsigned indent_pos = out->pos; + printbuf_advance_pos(out, len); + printbuf_do_indent(out, indent_pos); } void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) @@ -72,14 +173,14 @@ void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) do { va_start(args, fmt); - len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args); + len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args); va_end(args); - } while (len + 1 >= printbuf_remaining(out) && - !bch2_printbuf_make_room(out, len + 1)); + } while (len > printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len)); - len = min_t(size_t, len, - printbuf_remaining(out) ? 
printbuf_remaining(out) - 1 : 0); - out->pos += len; + unsigned indent_pos = out->pos; + printbuf_advance_pos(out, len); + printbuf_do_indent(out, indent_pos); } /** @@ -194,33 +295,20 @@ void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces) void bch2_prt_newline(struct printbuf *buf) { - unsigned i; - bch2_printbuf_make_room(buf, 1 + buf->indent); - __prt_char(buf, '\n'); + __prt_char_reserved(buf, '\n'); buf->last_newline = buf->pos; - for (i = 0; i < buf->indent; i++) - __prt_char(buf, ' '); + __prt_chars_reserved(buf, ' ', buf->indent); - printbuf_nul_terminate(buf); + printbuf_nul_terminate_reserved(buf); buf->last_field = buf->pos; buf->cur_tabstop = 0; } -/* - * Returns spaces from start of line, if set, or 0 if unset: - */ -static inline unsigned cur_tabstop(struct printbuf *buf) -{ - return buf->cur_tabstop < buf->nr_tabstops - ? buf->_tabstops[buf->cur_tabstop] - : 0; -} - static void __prt_tab(struct printbuf *out) { int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out)); @@ -247,24 +335,9 @@ void bch2_prt_tab(struct printbuf *out) static void __prt_tab_rjust(struct printbuf *buf) { - unsigned move = buf->pos - buf->last_field; int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf); - - if (pad > 0) { - bch2_printbuf_make_room(buf, pad); - - if (buf->last_field + pad < buf->size) - memmove(buf->buf + buf->last_field + pad, - buf->buf + buf->last_field, - min(move, buf->size - 1 - buf->last_field - pad)); - - if (buf->last_field < buf->size) - memset(buf->buf + buf->last_field, ' ', - min((unsigned) pad, buf->size - buf->last_field)); - - buf->pos += pad; - printbuf_nul_terminate(buf); - } + if (pad > 0) + printbuf_insert_spaces(buf, buf->last_field, pad); buf->last_field = buf->pos; buf->cur_tabstop++; @@ -301,41 +374,9 @@ void bch2_prt_tab_rjust(struct printbuf *buf) */ void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) { - const char *unprinted_start = str; - const char *end = str + 
count; - - if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) { - prt_bytes(out, str, count); - return; - } - - while (str != end) { - switch (*str) { - case '\n': - prt_bytes(out, unprinted_start, str - unprinted_start); - unprinted_start = str + 1; - bch2_prt_newline(out); - break; - case '\t': - if (likely(cur_tabstop(out))) { - prt_bytes(out, unprinted_start, str - unprinted_start); - unprinted_start = str + 1; - __prt_tab(out); - } - break; - case '\r': - if (likely(cur_tabstop(out))) { - prt_bytes(out, unprinted_start, str - unprinted_start); - unprinted_start = str + 1; - __prt_tab_rjust(out); - } - break; - } - - str++; - } - - prt_bytes(out, unprinted_start, str - unprinted_start); + unsigned indent_pos = out->pos; + prt_bytes(out, str, count); + printbuf_do_indent(out, indent_pos); } /** @@ -348,9 +389,10 @@ void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned cou void bch2_prt_human_readable_u64(struct printbuf *out, u64 v) { bch2_printbuf_make_room(out, 10); - out->pos += string_get_size(v, 1, !out->si_units, - out->buf + out->pos, - printbuf_remaining_size(out)); + unsigned len = string_get_size(v, 1, !out->si_units, + out->buf + out->pos, + printbuf_remaining_size(out)); + printbuf_advance_pos(out, len); } /** @@ -402,9 +444,7 @@ void bch2_prt_string_option(struct printbuf *out, const char * const list[], size_t selected) { - size_t i; - - for (i = 0; list[i]; i++) + for (size_t i = 0; list[i]; i++) bch2_prt_printf(out, i == selected ? 
"[%s] " : "%s ", list[i]); } diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h index 9a4a56c40937..9ecc56bc9635 100644 --- a/fs/bcachefs/printbuf.h +++ b/fs/bcachefs/printbuf.h @@ -86,6 +86,7 @@ struct printbuf { u8 atomic; bool allocation_failure:1; bool heap_allocated:1; + bool overflow:1; enum printbuf_si si_units:1; bool human_readable_units:1; bool has_indent_or_tabstops:1; @@ -142,7 +143,9 @@ void bch2_prt_bitflags_vector(struct printbuf *, const char * const[], */ static inline unsigned printbuf_remaining_size(struct printbuf *out) { - return out->pos < out->size ? out->size - out->pos : 0; + if (WARN_ON(out->size && out->pos >= out->size)) + out->pos = out->size - 1; + return out->size - out->pos; } /* @@ -151,7 +154,7 @@ static inline unsigned printbuf_remaining_size(struct printbuf *out) */ static inline unsigned printbuf_remaining(struct printbuf *out) { - return out->pos < out->size ? out->size - out->pos - 1 : 0; + return out->size ? printbuf_remaining_size(out) - 1 : 0; } static inline unsigned printbuf_written(struct printbuf *out) @@ -159,30 +162,25 @@ static inline unsigned printbuf_written(struct printbuf *out) return out->size ? 
min(out->pos, out->size - 1) : 0; } -/* - * Returns true if output was truncated: - */ -static inline bool printbuf_overflowed(struct printbuf *out) +static inline void printbuf_nul_terminate_reserved(struct printbuf *out) { - return out->pos >= out->size; + if (WARN_ON(out->size && out->pos >= out->size)) + out->pos = out->size - 1; + if (out->size) + out->buf[out->pos] = 0; } static inline void printbuf_nul_terminate(struct printbuf *out) { bch2_printbuf_make_room(out, 1); - - if (out->pos < out->size) - out->buf[out->pos] = 0; - else if (out->size) - out->buf[out->size - 1] = 0; + printbuf_nul_terminate_reserved(out); } /* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */ static inline void __prt_char_reserved(struct printbuf *out, char c) { if (printbuf_remaining(out)) - out->buf[out->pos] = c; - out->pos++; + out->buf[out->pos++] = c; } /* Doesn't nul terminate: */ @@ -194,37 +192,34 @@ static inline void __prt_char(struct printbuf *out, char c) static inline void prt_char(struct printbuf *out, char c) { - __prt_char(out, c); - printbuf_nul_terminate(out); + bch2_printbuf_make_room(out, 2); + __prt_char_reserved(out, c); + printbuf_nul_terminate_reserved(out); } static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n) { - unsigned i, can_print = min(n, printbuf_remaining(out)); + unsigned can_print = min(n, printbuf_remaining(out)); - for (i = 0; i < can_print; i++) + for (unsigned i = 0; i < can_print; i++) out->buf[out->pos++] = c; - out->pos += n - can_print; } static inline void prt_chars(struct printbuf *out, char c, unsigned n) { bch2_printbuf_make_room(out, n); __prt_chars_reserved(out, c, n); - printbuf_nul_terminate(out); + printbuf_nul_terminate_reserved(out); } static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n) { - unsigned i, can_print; - bch2_printbuf_make_room(out, n); - can_print = min(n, printbuf_remaining(out)); + unsigned can_print = min(n, printbuf_remaining(out)); - for (i 
= 0; i < can_print; i++) + for (unsigned i = 0; i < can_print; i++) out->buf[out->pos++] = ((char *) b)[i]; - out->pos += n - can_print; printbuf_nul_terminate(out); } @@ -241,18 +236,18 @@ static inline void prt_str_indented(struct printbuf *out, const char *str) static inline void prt_hex_byte(struct printbuf *out, u8 byte) { - bch2_printbuf_make_room(out, 2); + bch2_printbuf_make_room(out, 3); __prt_char_reserved(out, hex_asc_hi(byte)); __prt_char_reserved(out, hex_asc_lo(byte)); - printbuf_nul_terminate(out); + printbuf_nul_terminate_reserved(out); } static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) { - bch2_printbuf_make_room(out, 2); + bch2_printbuf_make_room(out, 3); __prt_char_reserved(out, hex_asc_upper_hi(byte)); __prt_char_reserved(out, hex_asc_upper_lo(byte)); - printbuf_nul_terminate(out); + printbuf_nul_terminate_reserved(out); } /** diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 556da0738106..a0cca8b70e0a 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -20,7 +20,7 @@ static const char * const bch2_quota_counters[] = { }; static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_quota *q = field_to_type(f, quota); @@ -60,8 +60,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { }; int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -97,45 +96,14 @@ static void qc_info_to_text(struct printbuf *out, struct qc_info *i) printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 20); - prt_str(out, "i_fieldmask"); - prt_tab(out); - prt_printf(out, "%x", i->i_fieldmask); - prt_newline(out); - - prt_str(out, "i_flags"); - prt_tab(out); - prt_printf(out, "%u", i->i_flags); - prt_newline(out); - - prt_str(out, "i_spc_timelimit"); - prt_tab(out); - 
prt_printf(out, "%u", i->i_spc_timelimit); - prt_newline(out); - - prt_str(out, "i_ino_timelimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_ino_timelimit); - prt_newline(out); - - prt_str(out, "i_rt_spc_timelimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_rt_spc_timelimit); - prt_newline(out); - - prt_str(out, "i_spc_warnlimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_spc_warnlimit); - prt_newline(out); - - prt_str(out, "i_ino_warnlimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_ino_warnlimit); - prt_newline(out); - - prt_str(out, "i_rt_spc_warnlimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_rt_spc_warnlimit); - prt_newline(out); + prt_printf(out, "i_fieldmask\t%x\n", i->i_fieldmask); + prt_printf(out, "i_flags\t%u\n", i->i_flags); + prt_printf(out, "i_spc_timelimit\t%u\n", i->i_spc_timelimit); + prt_printf(out, "i_ino_timelimit\t%u\n", i->i_ino_timelimit); + prt_printf(out, "i_rt_spc_timelimit\t%u\n", i->i_rt_spc_timelimit); + prt_printf(out, "i_spc_warnlimit\t%u\n", i->i_spc_warnlimit); + prt_printf(out, "i_ino_warnlimit\t%u\n", i->i_ino_warnlimit); + prt_printf(out, "i_rt_spc_warnlimit\t%u\n", i->i_rt_spc_warnlimit); } static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) @@ -143,60 +111,17 @@ static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 20); - prt_str(out, "d_fieldmask"); - prt_tab(out); - prt_printf(out, "%x", q->d_fieldmask); - prt_newline(out); - - prt_str(out, "d_spc_hardlimit"); - prt_tab(out); - prt_printf(out, "%llu", q->d_spc_hardlimit); - prt_newline(out); - - prt_str(out, "d_spc_softlimit"); - prt_tab(out); - prt_printf(out, "%llu", q->d_spc_softlimit); - prt_newline(out); - - prt_str(out, "d_ino_hardlimit"); - prt_tab(out); - prt_printf(out, "%llu", q->d_ino_hardlimit); - prt_newline(out); - - prt_str(out, "d_ino_softlimit"); - prt_tab(out); - prt_printf(out, "%llu", q->d_ino_softlimit); - prt_newline(out); - - prt_str(out, 
"d_space"); - prt_tab(out); - prt_printf(out, "%llu", q->d_space); - prt_newline(out); - - prt_str(out, "d_ino_count"); - prt_tab(out); - prt_printf(out, "%llu", q->d_ino_count); - prt_newline(out); - - prt_str(out, "d_ino_timer"); - prt_tab(out); - prt_printf(out, "%llu", q->d_ino_timer); - prt_newline(out); - - prt_str(out, "d_spc_timer"); - prt_tab(out); - prt_printf(out, "%llu", q->d_spc_timer); - prt_newline(out); - - prt_str(out, "d_ino_warns"); - prt_tab(out); - prt_printf(out, "%i", q->d_ino_warns); - prt_newline(out); - - prt_str(out, "d_spc_warns"); - prt_tab(out); - prt_printf(out, "%i", q->d_spc_warns); - prt_newline(out); + prt_printf(out, "d_fieldmask\t%x\n", q->d_fieldmask); + prt_printf(out, "d_spc_hardlimit\t%llu\n", q->d_spc_hardlimit); + prt_printf(out, "d_spc_softlimit\t%llu\n", q->d_spc_softlimit); + prt_printf(out, "d_ino_hardlimit\t%llu\n", q->d_ino_hardlimit); + prt_printf(out, "d_ino_softlimit\t%llu\n", q->d_ino_softlimit); + prt_printf(out, "d_space\t%llu\n", q->d_space); + prt_printf(out, "d_ino_count\t%llu\n", q->d_ino_count); + prt_printf(out, "d_ino_timer\t%llu\n", q->d_ino_timer); + prt_printf(out, "d_spc_timer\t%llu\n", q->d_spc_timer); + prt_printf(out, "d_ino_warns\t%i\n", q->d_ino_warns); + prt_printf(out, "d_spc_warns\t%i\n", q->d_spc_warns); } static inline unsigned __next_qtype(unsigned i, unsigned qtypes) @@ -610,10 +535,10 @@ int bch2_fs_quota_read(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, __bch2_quota_set(c, k, NULL)) ?: for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, bch2_fs_quota_read_inode(trans, &iter, k))); bch_err_fn(c, ret); return ret; @@ -900,7 +825,7 @@ static int bch2_set_quota_trans(struct btree_trans *trans, int ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p, - 
BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); ret = bkey_err(k); if (unlikely(ret)) return ret; diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 884f601f41c4..02d37a332218 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -5,11 +5,11 @@ #include "inode.h" #include "quota_types.h" -enum bkey_invalid_flags; +enum bch_validate_flags; extern const struct bch_sb_field_ops bch_sb_field_ops_quota; int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota ((struct bkey_ops) { \ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 56336f3dd1d0..cf81e5128c3a 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -42,7 +42,7 @@ static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), - BTREE_ITER_INTENT); + BTREE_ITER_intent); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -89,7 +89,7 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), - BTREE_ITER_INTENT); + BTREE_ITER_intent); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -140,7 +140,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, bch2_trans_iter_init(trans, extent_iter, work_pos.inode ? 
BTREE_ID_extents : BTREE_ID_reflink, work_pos, - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_all_snapshots); k = bch2_btree_iter_peek_slot(extent_iter); if (bkey_err(k)) return k; @@ -323,12 +323,14 @@ static int do_rebalance(struct moving_context *ctxt) struct bkey_s_c k; int ret = 0; + bch2_trans_begin(trans); + bch2_move_stats_init(&r->work_stats, "rebalance_work"); bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); bch2_trans_iter_init(trans, &rebalance_work_iter, BTREE_ID_rebalance_work, POS_MIN, - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_all_snapshots); while (!bch2_move_ratelimit(ctxt)) { if (!r->enabled) { diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 8091d0686029..1266916ac03f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -65,9 +65,20 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent); + + __set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent); + + __set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, 
ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent); @@ -125,9 +136,9 @@ static int bch2_journal_replay_key(struct btree_trans *trans, { struct btree_iter iter; unsigned iter_flags = - BTREE_ITER_INTENT| - BTREE_ITER_NOT_EXTENTS; - unsigned update_flags = BTREE_TRIGGER_NORUN; + BTREE_ITER_intent| + BTREE_ITER_not_extents; + unsigned update_flags = BTREE_TRIGGER_norun; int ret; if (k->overwritten) @@ -136,17 +147,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans, trans->journal_res.seq = k->journal_seq; /* - * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to + * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to * keep the key cache coherent with the underlying btree. Nothing * besides the allocator is doing updates yet so we don't need key cache * coherency for non-alloc btrees, and key cache fills for snapshots - * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until + * btrees use BTREE_ITER_filter_snapshots, which isn't available until * the snapshots recovery pass runs. 
*/ if (!k->level && k->btree_id == BTREE_ID_alloc) - iter_flags |= BTREE_ITER_CACHED; + iter_flags |= BTREE_ITER_cached; else - update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM; + update_flags |= BTREE_UPDATE_key_cache_reclaim; bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, @@ -191,7 +202,7 @@ int bch2_journal_replay(struct bch_fs *c) struct journal *j = &c->journal; u64 start_seq = c->journal_replay_seq_start; u64 end_seq = c->journal_replay_seq_start; - struct btree_trans *trans = bch2_trans_get(c); + struct btree_trans *trans = NULL; bool immediate_flush = false; int ret = 0; @@ -205,6 +216,7 @@ int bch2_journal_replay(struct bch_fs *c) BUG_ON(!atomic_read(&keys->ref)); move_gap(keys, keys->nr); + trans = bch2_trans_get(c); /* * First, attempt to replay keys in sorted order. This is more @@ -361,14 +373,17 @@ static int journal_replay_entry_early(struct bch_fs *c, case BCH_JSET_ENTRY_dev_usage: { struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); - struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); - unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); - - for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { - ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); - ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); - ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); - } + unsigned nr_types = jset_entry_dev_usage_nr_types(u); + + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, le32_to_cpu(u->dev)); + if (ca) + for (unsigned i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { + ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); + ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); + ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); + } + rcu_read_unlock(); break; } @@ -597,56 +612,54 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.norecovery) 
c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1; - if (!c->opts.nochanges) { - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - bool write_sb = false; - - if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { - ext->recovery_passes_required[0] |= - cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); - write_sb = true; - } + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + bool write_sb = false; - u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - if (sb_passes) { - struct printbuf buf = PRINTBUF; - prt_str(&buf, "superblock requires following recovery passes to be run:\n "); - prt_bitflags(&buf, bch2_recovery_passes, sb_passes); - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } + if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { + ext->recovery_passes_required[0] |= + cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); + write_sb = true; + } - if (bch2_check_version_downgrade(c)) { - struct printbuf buf = PRINTBUF; + u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + if (sb_passes) { + struct printbuf buf = PRINTBUF; + prt_str(&buf, "superblock requires following recovery passes to be run:\n "); + prt_bitflags(&buf, bch2_recovery_passes, sb_passes); + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + } - prt_str(&buf, "Version downgrade required:"); + if (bch2_check_version_downgrade(c)) { + struct printbuf buf = PRINTBUF; - __le64 passes = ext->recovery_passes_required[0]; - bch2_sb_set_downgrade(c, - BCH_VERSION_MINOR(bcachefs_metadata_version_current), - BCH_VERSION_MINOR(c->sb.version)); - passes = ext->recovery_passes_required[0] & ~passes; - if (passes) { - prt_str(&buf, "\n running recovery passes: "); - prt_bitflags(&buf, bch2_recovery_passes, - 
bch2_recovery_passes_from_stable(le64_to_cpu(passes))); - } + prt_str(&buf, "Version downgrade required:"); - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - write_sb = true; + __le64 passes = ext->recovery_passes_required[0]; + bch2_sb_set_downgrade(c, + BCH_VERSION_MINOR(bcachefs_metadata_version_current), + BCH_VERSION_MINOR(c->sb.version)); + passes = ext->recovery_passes_required[0] & ~passes; + if (passes) { + prt_str(&buf, "\n running recovery passes: "); + prt_bitflags(&buf, bch2_recovery_passes, + bch2_recovery_passes_from_stable(le64_to_cpu(passes))); } - if (check_version_upgrade(c)) - write_sb = true; + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + write_sb = true; + } - if (write_sb) - bch2_write_super(c); + if (check_version_upgrade(c)) + write_sb = true; - c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - mutex_unlock(&c->sb_lock); - } + if (write_sb) + bch2_write_super(c); + + c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + mutex_unlock(&c->sb_lock); if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); @@ -660,7 +673,9 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!c->sb.clean || c->opts.fsck || c->opts.retain_recovery_info) { + bch2_journal_pos_from_member_info_resume(c); + + if (!c->sb.clean || c->opts.retain_recovery_info) { struct genradix_iter iter; struct journal_replay **i; @@ -832,8 +847,8 @@ use_clean: } mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - bool write_sb = false; + ext = bch2_sb_field_get(c->disk_sb.sb, ext); + write_sb = false; if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) { SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version)); @@ -868,6 +883,9 @@ use_clean: write_sb = 
true; } + if (bch2_blacklist_entries_gc(c)) + write_sb = true; + if (write_sb) bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -890,10 +908,6 @@ use_clean: bch_info(c, "scanning for old btree nodes done"); } - if (c->journal_seq_blacklist_table && - c->journal_seq_blacklist_table->nr > 128) - queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); - ret = 0; out: bch2_flush_fsck_errs(c); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 0cec0f7d9703..4a9eb9582b6e 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -26,11 +26,6 @@ const char * const bch2_recovery_passes[] = { NULL }; -static int bch2_check_allocations(struct bch_fs *c) -{ - return bch2_gc(c, true, false); -} - static int bch2_set_may_go_rw(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; @@ -227,7 +222,8 @@ int bch2_run_recovery_passes(struct bch_fs *c) if (should_run_recovery_pass(c, c->curr_recovery_pass)) { unsigned pass = c->curr_recovery_pass; - ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass) ?: + bch2_journal_flush(&c->journal); if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || (ret && c->curr_recovery_pass < pass)) continue; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index ff7864731a07..9ac6cf21cfbf 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -30,7 +30,7 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); @@ -74,20 +74,20 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r } static int trans_trigger_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 *idx, unsigned flags) + struct 
bkey_s_c_reflink_p p, u64 *idx, + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_i *k; __le64 *refcount; - int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1; struct printbuf buf = PRINTBUF; int ret; k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_reflink, POS(0, *idx), - BTREE_ITER_WITH_UPDATES); + BTREE_ITER_with_updates); ret = PTR_ERR_OR_ZERO(k); if (ret) goto err; @@ -102,7 +102,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, goto err; } - if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { + if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) { bch2_bkey_val_to_text(&buf, c, p.s_c); bch2_trans_inconsistent(trans, "indirect extent refcount underflow at %llu while marking\n %s", @@ -111,7 +111,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, goto err; } - if (flags & BTREE_TRIGGER_INSERT) { + if (flags & BTREE_TRIGGER_insert) { struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; u64 pad; @@ -141,12 +141,13 @@ err: } static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 *idx, unsigned flags, size_t r_idx) + struct bkey_s_c_reflink_p p, u64 *idx, + enum btree_iter_update_trigger_flags flags, + size_t r_idx) { struct bch_fs *c = trans->c; struct reflink_gc *r; - int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + int add = !(flags & BTREE_TRIGGER_overwrite) ? 
1 : -1; u64 start = le64_to_cpu(p.v->idx); u64 end = le64_to_cpu(p.v->idx) + p.k->size; u64 next_idx = end + le32_to_cpu(p.v->back_pad); @@ -163,10 +164,13 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, BUG_ON((s64) r->refcount + add < 0); - r->refcount += add; + if (flags & BTREE_TRIGGER_gc) + r->refcount += add; *idx = r->offset; return 0; not_found: + BUG_ON(!(flags & BTREE_TRIGGER_check_repair)); + if (fsck_err(c, reflink_p_to_missing_reflink_v, "pointer to missing indirect extent\n" " %s\n" @@ -189,7 +193,7 @@ not_found: set_bkey_val_u64s(&update->k, 0); } - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_NORUN); + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_norun); } *idx = next_idx; @@ -200,8 +204,8 @@ fsck_err: } static int __trigger_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) + enum btree_id btree_id, unsigned level, struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); @@ -210,12 +214,12 @@ static int __trigger_reflink_p(struct btree_trans *trans, u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad); - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { while (idx < end && !ret) ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags); } - if (flags & BTREE_TRIGGER_GC) { + if (flags & (BTREE_TRIGGER_check_repair|BTREE_TRIGGER_gc)) { size_t l = 0, r = c->reflink_gc_nr; while (l < r) { @@ -238,10 +242,10 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { - if ((flags & BTREE_TRIGGER_TRANSACTIONAL) && - (flags & BTREE_TRIGGER_INSERT)) { + 
if ((flags & BTREE_TRIGGER_transactional) && + (flags & BTREE_TRIGGER_insert)) { struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v; v->front_pad = v->back_pad = 0; @@ -253,7 +257,7 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, /* indirect extents */ int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { return bch2_bkey_ptrs_invalid(c, k, flags, err); @@ -281,23 +285,25 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r } #endif -static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *flags) +static inline void +check_indirect_extent_deleting(struct bkey_s new, + enum btree_iter_update_trigger_flags *flags) { - if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) { + if ((*flags & BTREE_TRIGGER_insert) && !*bkey_refcount(new)) { new.k->type = KEY_TYPE_deleted; new.k->size = 0; set_bkey_val_u64s(new.k, 0); - *flags &= ~BTREE_TRIGGER_INSERT; + *flags &= ~BTREE_TRIGGER_insert; } } int bch2_trigger_reflink_v(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { - if ((flags & BTREE_TRIGGER_TRANSACTIONAL) && - (flags & BTREE_TRIGGER_INSERT)) + if ((flags & BTREE_TRIGGER_transactional) && + (flags & BTREE_TRIGGER_insert)) check_indirect_extent_deleting(new, &flags); return bch2_trigger_extent(trans, btree_id, level, old, new, flags); @@ -306,7 +312,7 @@ int bch2_trigger_reflink_v(struct btree_trans *trans, /* indirect inline data */ int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { return 0; @@ -326,7 +332,7 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out, int bch2_trigger_indirect_inline_data(struct btree_trans *trans, enum btree_id btree_id, unsigned 
level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { check_indirect_extent_deleting(new, &flags); @@ -349,7 +355,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX, - BTREE_ITER_INTENT); + BTREE_ITER_intent); k = bch2_btree_iter_peek_prev(&reflink_iter); ret = bkey_err(k); if (ret) @@ -394,7 +400,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); err: bch2_trans_iter_exit(trans, &reflink_iter); @@ -455,9 +461,9 @@ s64 bch2_remap_range(struct bch_fs *c, goto err; bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start, - BTREE_ITER_INTENT); + BTREE_ITER_intent); bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start, - BTREE_ITER_INTENT); + BTREE_ITER_intent); while ((ret == 0 || bch2_err_matches(ret, BCH_ERR_transaction_restart)) && @@ -567,7 +573,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_trans_begin(trans); ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u, - dst_inum, BTREE_ITER_INTENT); + dst_inum, BTREE_ITER_intent); if (!ret2 && inode_u.bi_size < new_i_size) { diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 4d8867289717..e894f3a2c67a 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -2,15 +2,16 @@ #ifndef _BCACHEFS_REFLINK_H #define _BCACHEFS_REFLINK_H -enum bkey_invalid_flags; +enum bch_validate_flags; int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct 
bkey_s, struct bkey_s_c); int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ .key_invalid = bch2_reflink_p_invalid, \ @@ -21,11 +22,12 @@ int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, }) int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ @@ -36,13 +38,13 @@ int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, }) int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_indirect_inline_data(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, - unsigned); + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ .key_invalid = bch2_indirect_inline_data_invalid, \ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 678b9c20e251..bd1d5d085e23 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -84,7 +84,7 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, } for (unsigned i = 0; i < r->nr_devs; i++) - if (!bch2_dev_exists(sb, r->devs[i])) { + if (!bch2_member_exists(sb, r->devs[i])) { prt_printf(err, "invalid device %u in 
entry ", r->devs[i]); goto bad; } @@ -200,7 +200,7 @@ cpu_replicas_add_entry(struct bch_fs *c, }; for (i = 0; i < new_entry->nr_devs; i++) - BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i])); + BUG_ON(!bch2_dev_exists(c, new_entry->devs[i])); BUG_ON(!new_entry->data_type); verify_replicas_entry(new_entry); @@ -860,7 +860,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, } static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); struct bch_replicas_cpu cpu_r; @@ -899,7 +899,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas = { }; static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); struct bch_replicas_cpu cpu_r; @@ -947,18 +947,20 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, percpu_down_read(&c->mark_lock); for_each_cpu_replicas_entry(&c->replicas, e) { - unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; + unsigned nr_online = 0, nr_failed = 0, dflags = 0; bool metadata = e->data_type < BCH_DATA_user; if (e->data_type == BCH_DATA_cached) continue; - for (i = 0; i < e->nr_devs; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); - + rcu_read_lock(); + for (unsigned i = 0; i < e->nr_devs; i++) { nr_online += test_bit(e->devs[i], devs.d); - nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed; + + struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]); + nr_failed += ca && ca->mi.state == BCH_MEMBER_STATE_failed; } + rcu_read_unlock(); if (nr_failed == e->nr_devs) continue; diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index 194e55b11137..47f10ab57f40 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -266,9 +266,8 @@ 
void bch2_journal_super_entries_add_common(struct bch_fs *c, } } -static int bch2_sb_clean_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_clean *clean = field_to_type(f, clean); @@ -283,7 +282,7 @@ static int bch2_sb_clean_validate(struct bch_sb *sb, entry = vstruct_next(entry)) { if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) { prt_str(err, "entry type "); - bch2_prt_jset_entry_type(err, le16_to_cpu(entry->type)); + bch2_prt_jset_entry_type(err, entry->type); prt_str(err, " overruns end of section"); return -BCH_ERR_invalid_sb_clean; } @@ -298,10 +297,8 @@ static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field_clean *clean = field_to_type(f, clean); struct jset_entry *entry; - prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); - prt_newline(out); - prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); - prt_newline(out); + prt_printf(out, "flags: %x\n", le32_to_cpu(clean->flags)); + prt_printf(out, "journal_seq: %llu\n", le64_to_cpu(clean->journal_seq)); for (entry = clean->start; entry != vstruct_end(&clean->field); @@ -392,6 +389,8 @@ void bch2_fs_mark_clean(struct bch_fs *c) goto out; } + bch2_journal_pos_from_member_info_set(c); + bch2_write_super(c); out: mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c index 7dc898761bb3..6992e7469112 100644 --- a/fs/bcachefs/sb-counters.c +++ b/fs/bcachefs/sb-counters.c @@ -20,9 +20,8 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; }; -static int bch2_sb_counters_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum 
bch_validate_flags flags, struct printbuf *err) { return 0; }; @@ -31,19 +30,12 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_counters *ctrs = field_to_type(f, counters); - unsigned int i; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - for (i = 0; i < nr; i++) { - if (i < BCH_COUNTER_NR) - prt_printf(out, "%s ", bch2_counter_names[i]); - else - prt_printf(out, "(unknown)"); - - prt_tab(out); - prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i])); - prt_newline(out); - } + for (unsigned i = 0; i < nr; i++) + prt_printf(out, "%s \t%llu\n", + i < BCH_COUNTER_NR ? bch2_counter_names[i] : "(unknown)", + le64_to_cpu(ctrs->d[i])); }; int bch2_sb_counters_to_cpu(struct bch_fs *c) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index a98ef940b7a3..390a1bbd2567 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -134,15 +134,25 @@ downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e) #define for_each_downgrade_entry(_d, _i) \ for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries; \ (void *) _i < vstruct_end(&(_d)->field) && \ - (void *) &_i->errors[0] < vstruct_end(&(_d)->field); \ + (void *) &_i->errors[0] <= vstruct_end(&(_d)->field) && \ + (void *) downgrade_entry_next_c(_i) <= vstruct_end(&(_d)->field); \ _i = downgrade_entry_next_c(_i)) static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_downgrade *e = field_to_type(f, downgrade); - for_each_downgrade_entry(e, i) { + for (const struct bch_sb_field_downgrade_entry *i = e->entries; + (void *) i < vstruct_end(&e->field); + i = downgrade_entry_next_c(i)) { + if (flags & BCH_VALIDATE_write && + ((void *) &i->errors[0] > vstruct_end(&e->field) || + (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field))) { + prt_printf(err, "downgrade 
entry overruns end of superblock section)"); + return -BCH_ERR_invalid_sb_downgrade; + } + if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) != BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) { prt_printf(err, "downgrade entry with mismatched major version (%u != %u)", @@ -164,19 +174,16 @@ static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb, printbuf_tabstop_push(out, 16); for_each_downgrade_entry(e, i) { - prt_str(out, "version:"); - prt_tab(out); + prt_str(out, "version:\t"); bch2_version_to_text(out, le16_to_cpu(i->version)); prt_newline(out); - prt_str(out, "recovery passes:"); - prt_tab(out); + prt_str(out, "recovery passes:\t"); prt_bitflags(out, bch2_recovery_passes, bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0]))); prt_newline(out); - prt_str(out, "errors:"); - prt_tab(out); + prt_str(out, "errors:\t"); bool first = true; for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) { if (!first) diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c index 5f5bcae391fb..bda33e59e226 100644 --- a/fs/bcachefs/sb-errors.c +++ b/fs/bcachefs/sb-errors.c @@ -30,7 +30,7 @@ static inline unsigned bch2_sb_field_errors_u64s(unsigned nr) } static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_errors *e = field_to_type(f, errors); unsigned i, nr = bch2_sb_field_errors_nr_entries(e); diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 06c7a644f4a4..87324747351a 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -272,7 +272,8 @@ x(snapshot_node_missing, 264) \ x(dup_backpointer_to_bad_csum_extent, 265) \ x(btree_bitmap_not_marked, 266) \ - x(sb_clean_entry_overrun, 267) + x(sb_clean_entry_overrun, 267) \ + x(btree_ptr_v2_written_0, 268) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c 
b/fs/bcachefs/sb-members.c index 44b3f0cb7b49..39196f2a4197 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -3,11 +3,22 @@ #include "bcachefs.h" #include "btree_cache.h" #include "disk_groups.h" +#include "error.h" #include "opts.h" #include "replicas.h" #include "sb-members.h" #include "super-io.h" +void bch2_dev_missing(struct bch_fs *c, unsigned dev) +{ + bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); +} + +void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket) +{ + bch2_fs_inconsistent(c, "pointer to nonexistent bucket %llu:%llu", bucket.inode, bucket.offset); +} + #define x(t, n, ...) [n] = #t, static const char * const bch2_iops_measurements[] = { BCH_IOPS_MEASUREMENTS() @@ -164,18 +175,14 @@ static void member_to_text(struct printbuf *out, u64 bucket_size = le16_to_cpu(m.bucket_size); u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size; - if (!bch2_member_exists(&m)) + if (!bch2_member_alive(&m)) return; - prt_printf(out, "Device:"); - prt_tab(out); - prt_printf(out, "%u", i); - prt_newline(out); + prt_printf(out, "Device:\t%u\n", i); printbuf_indent_add(out, 2); - prt_printf(out, "Label:"); - prt_tab(out); + prt_printf(out, "Label:\t"); if (BCH_MEMBER_GROUP(&m)) { unsigned idx = BCH_MEMBER_GROUP(&m) - 1; @@ -189,103 +196,73 @@ static void member_to_text(struct printbuf *out, } prt_newline(out); - prt_printf(out, "UUID:"); - prt_tab(out); + prt_printf(out, "UUID:\t"); pr_uuid(out, m.uuid.b); prt_newline(out); - prt_printf(out, "Size:"); - prt_tab(out); + prt_printf(out, "Size:\t"); prt_units_u64(out, device_size << 9); prt_newline(out); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { - prt_printf(out, "%s errors:", bch2_member_error_strs[i]); - prt_tab(out); - prt_u64(out, le64_to_cpu(m.errors[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) + prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i])); - for (unsigned i = 0; i < 
BCH_IOPS_NR; i++) { - prt_printf(out, "%s iops:", bch2_iops_measurements[i]); - prt_tab(out); - prt_printf(out, "%u", le32_to_cpu(m.iops[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_IOPS_NR; i++) + prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i])); - prt_printf(out, "Bucket size:"); - prt_tab(out); + prt_printf(out, "Bucket size:\t"); prt_units_u64(out, bucket_size << 9); prt_newline(out); - prt_printf(out, "First bucket:"); - prt_tab(out); - prt_printf(out, "%u", le16_to_cpu(m.first_bucket)); - prt_newline(out); - - prt_printf(out, "Buckets:"); - prt_tab(out); - prt_printf(out, "%llu", le64_to_cpu(m.nbuckets)); - prt_newline(out); + prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket)); + prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets)); - prt_printf(out, "Last mount:"); - prt_tab(out); + prt_printf(out, "Last mount:\t"); if (m.last_mount) bch2_prt_datetime(out, le64_to_cpu(m.last_mount)); else prt_printf(out, "(never)"); prt_newline(out); - prt_printf(out, "Last superblock write:"); - prt_tab(out); - prt_u64(out, le64_to_cpu(m.seq)); - prt_newline(out); + prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq)); - prt_printf(out, "State:"); - prt_tab(out); - prt_printf(out, "%s", + prt_printf(out, "State:\t%s\n", BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR ? bch2_member_states[BCH_MEMBER_STATE(&m)] : "unknown"); - prt_newline(out); - prt_printf(out, "Data allowed:"); - prt_tab(out); + prt_printf(out, "Data allowed:\t"); if (BCH_MEMBER_DATA_ALLOWED(&m)) prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); else prt_printf(out, "(none)"); prt_newline(out); - prt_printf(out, "Has data:"); - prt_tab(out); + prt_printf(out, "Has data:\t"); if (data_have) prt_bitflags(out, __bch2_data_types, data_have); else prt_printf(out, "(none)"); prt_newline(out); - prt_str(out, "Durability:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_DURABILITY(&m) ? 
BCH_MEMBER_DURABILITY(&m) - 1 : 1); + prt_printf(out, "Btree allocated bitmap blocksize:\t"); + prt_units_u64(out, 1ULL << m.btree_bitmap_shift); prt_newline(out); - prt_printf(out, "Discard:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m)); + prt_printf(out, "Btree allocated bitmap:\t"); + bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64); prt_newline(out); - prt_printf(out, "Freespace initialized:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); - prt_newline(out); + prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1); + + prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m)); + prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); printbuf_indent_sub(out, 2); } -static int bch2_sb_members_v1_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); unsigned i; @@ -333,9 +310,8 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, member_to_text(out, members_v2_get(mi, i), gi, sb, i); } -static int bch2_sb_members_v2_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) - @@ -390,12 +366,8 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) prt_newline(out); printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { - prt_printf(out, "%s:", bch2_member_error_strs[i]); - prt_tab(out); - prt_u64(out, 
atomic64_read(&ca->errors[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) + prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i])); printbuf_indent_sub(out, 2); prt_str(out, "IO errors since "); @@ -404,12 +376,9 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) prt_newline(out); printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { - prt_printf(out, "%s:", bch2_member_error_strs[i]); - prt_tab(out); - prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) + prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], + atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); printbuf_indent_sub(out, 2); } @@ -437,11 +406,20 @@ void bch2_dev_errors_reset(struct bch_dev *ca) bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) { - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) - if (!bch2_dev_btree_bitmap_marked_sectors(bch_dev_bkey_exists(c, ptr->dev), - ptr->offset, btree_sectors(c))) - return false; - return true; + bool ret = true; + rcu_read_lock(); + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; + + if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) { + ret = false; + break; + } + } + rcu_read_unlock(); + return ret; } static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, @@ -463,6 +441,9 @@ static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, uns m->btree_bitmap_shift += resize; } + BUG_ON(m->btree_bitmap_shift > 57); + BUG_ON(end > 64ULL << m->btree_bitmap_shift); + for (unsigned bit = start >> m->btree_bitmap_shift; (u64) bit << m->btree_bitmap_shift < end; bit++) @@ -476,6 +457,10 @@ void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k) 
lockdep_assert_held(&c->sb_lock); struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + if (!bch2_member_exists(c->disk_sb.sb, ptr->dev)) + continue; + __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c)); + } } diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 5bf27d30ca29..dd93192ec065 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -29,19 +29,6 @@ static inline bool bch2_dev_is_readable(struct bch_dev *ca) ca->mi.state != BCH_MEMBER_STATE_failed; } -static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) -{ - if (!percpu_ref_tryget(&ca->io_ref)) - return false; - - if (ca->mi.state == BCH_MEMBER_STATE_rw || - (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) - return true; - - percpu_ref_put(&ca->io_ref); - return false; -} - static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) { return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); @@ -105,14 +92,41 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev * for (struct bch_dev *_ca = NULL; \ (_ca = __bch2_next_dev((_c), _ca, (_mask)));) -static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) +static inline void bch2_dev_get(struct bch_dev *ca) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + BUG_ON(atomic_long_inc_return(&ca->ref) <= 1L); +#else + percpu_ref_get(&ca->ref); +#endif +} + +static inline void __bch2_dev_put(struct bch_dev *ca) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + long r = atomic_long_dec_return(&ca->ref); + if (r < (long) !ca->dying) + panic("bch_dev->ref underflow, last put: %pS\n", (void *) ca->last_put); + ca->last_put = _THIS_IP_; + if (!r) + complete(&ca->ref_completion); +#else + percpu_ref_put(&ca->ref); +#endif +} + +static inline void bch2_dev_put(struct bch_dev *ca) { - rcu_read_lock(); if (ca) - percpu_ref_put(&ca->ref); + 
__bch2_dev_put(ca); +} +static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) +{ + rcu_read_lock(); + bch2_dev_put(ca); if ((ca = __bch2_next_dev(c, ca, NULL))) - percpu_ref_get(&ca->ref); + bch2_dev_get(ca); rcu_read_unlock(); return ca; @@ -158,26 +172,113 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, #define for_each_readable_member(c, ca) \ __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro)) -/* - * If a key exists that references a device, the device won't be going away and - * we can omit rcu_read_lock(): - */ -static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) +static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev) { - EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + return dev < c->sb.nr_devices && c->devs[dev]; +} - return rcu_dereference_check(c->devs[idx], 1); +static inline bool bucket_valid(const struct bch_dev *ca, u64 b) +{ + return b - ca->mi.first_bucket < ca->mi.nbuckets_minus_first; } -static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) +static inline struct bch_dev *bch2_dev_have_ref(const struct bch_fs *c, unsigned dev) { - EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + EBUG_ON(!bch2_dev_exists(c, dev)); + + return rcu_dereference_check(c->devs[dev], 1); +} - return rcu_dereference_protected(c->devs[idx], +static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev) +{ + EBUG_ON(!bch2_dev_exists(c, dev)); + + return rcu_dereference_protected(c->devs[dev], lockdep_is_held(&c->sb_lock) || lockdep_is_held(&c->state_lock)); } +static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) +{ + return c && dev < c->sb.nr_devices + ? 
rcu_dereference(c->devs[dev]) + : NULL; +} + +static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev) +{ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, dev); + if (ca) + bch2_dev_get(ca); + rcu_read_unlock(); + return ca; +} + +void bch2_dev_missing(struct bch_fs *, unsigned); + +static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev) +{ + struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); + if (!ca) + bch2_dev_missing(c, dev); + return ca; +} + +static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket) +{ + struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode); + if (ca && !bucket_valid(ca, bucket.offset)) { + bch2_dev_put(ca); + ca = NULL; + } + return ca; +} + +void bch2_dev_bucket_missing(struct bch_fs *, struct bpos); + +static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket) +{ + struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, bucket); + if (!ca) + bch2_dev_bucket_missing(c, bucket); + return ca; +} + +static inline struct bch_dev *bch2_dev_iterate_noerror(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) +{ + if (ca && ca->dev_idx == dev_idx) + return ca; + bch2_dev_put(ca); + return bch2_dev_tryget_noerror(c, dev_idx); +} + +static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) +{ + if (ca && ca->dev_idx == dev_idx) + return ca; + bch2_dev_put(ca); + return bch2_dev_tryget(c, dev_idx); +} + +static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw) +{ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, dev); + if (ca && !percpu_ref_tryget(&ca->io_ref)) + ca = NULL; + rcu_read_unlock(); + + if (ca && + (ca->mi.state == BCH_MEMBER_STATE_rw || + (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))) + return ca; + + if (ca) + percpu_ref_put(&ca->io_ref); + return NULL; +} + /* XXX kill, move to struct bch_fs */ 
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) { @@ -192,16 +293,16 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; -static inline bool bch2_member_exists(struct bch_member *m) +static inline bool bch2_member_alive(struct bch_member *m) { return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); } -static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev) +static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev) { if (dev < sb->nr_devices) { struct bch_member m = bch2_sb_member_get(sb, dev); - return bch2_member_exists(&m); + return bch2_member_alive(&m); } return false; } @@ -210,6 +311,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) { return (struct bch_member_cpu) { .nbuckets = le64_to_cpu(mi->nbuckets), + .nbuckets_minus_first = le64_to_cpu(mi->nbuckets) - + le16_to_cpu(mi->first_bucket), .first_bucket = le16_to_cpu(mi->first_bucket), .bucket_size = le16_to_cpu(mi->bucket_size), .group = BCH_MEMBER_GROUP(mi), @@ -220,7 +323,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ? 
BCH_MEMBER_DURABILITY(mi) - 1 : 1, .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), - .valid = bch2_member_exists(mi), + .valid = bch2_member_alive(mi), .btree_bitmap_shift = mi->btree_bitmap_shift, .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), }; diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h new file mode 100644 index 000000000000..c0eda888fe39 --- /dev/null +++ b/fs/bcachefs/sb-members_types.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_MEMBERS_TYPES_H +#define _BCACHEFS_SB_MEMBERS_TYPES_H + +struct bch_member_cpu { + u64 nbuckets; /* device size */ + u64 nbuckets_minus_first; + u16 first_bucket; /* index of first bucket used */ + u16 bucket_size; /* sectors */ + u16 group; + u8 state; + u8 discard; + u8 data_allowed; + u8 durability; + u8 freespace_initialized; + u8 valid; + u8 btree_bitmap_shift; + u64 btree_allocated_bitmap; +}; + +#endif /* _BCACHEFS_SB_MEMBERS_TYPES_H */ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 544322d5c251..629900a5e641 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -32,7 +32,7 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -49,7 +49,7 @@ int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot_tree *s) { int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), - BTREE_ITER_WITH_UPDATES, snapshot_tree, s); + BTREE_ITER_with_updates, snapshot_tree, s); if (bch2_err_matches(ret, ENOENT)) ret = -BCH_ERR_ENOENT_snapshot_tree; @@ -223,7 +223,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf 
*err) { struct bkey_s_c_snapshot s; @@ -298,7 +298,7 @@ static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) static int __bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct snapshot_t *t; @@ -352,7 +352,7 @@ err: int bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags); } @@ -361,7 +361,7 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot *s) { return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_WITH_UPDATES, snapshot, s); + BTREE_ITER_with_updates, snapshot, s); } static int bch2_snapshot_live(struct btree_trans *trans, u32 id) @@ -618,7 +618,7 @@ int bch2_check_snapshot_trees(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_snapshot_tree(trans, &iter, k))); bch_err_fn(c, ret); @@ -695,7 +695,7 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, root = bch2_bkey_get_iter_typed(trans, &root_iter, BTREE_ID_snapshots, POS(0, root_id), - BTREE_ITER_WITH_UPDATES, snapshot); + BTREE_ITER_with_updates, snapshot); ret = bkey_err(root); if (ret) goto err; @@ -886,7 +886,7 @@ int bch2_check_snapshots(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_snapshots, POS_MAX, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_snapshot(trans, &iter, k))); bch_err_fn(c, ret); @@ -900,7 +900,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) 
if (bch2_snapshot_equiv(c, id)) return 0; - u32 tree_id; + /* 0 is an invalid tree ID */ + u32 tree_id = 0; int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); if (ret) return ret; @@ -1001,7 +1002,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) r.btree = btree; ret = for_each_btree_key(trans, iter, btree, POS_MIN, - BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({ get_snapshot_trees(c, &r, k.k->p); })); if (ret) @@ -1018,7 +1019,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) darray_for_each(*t, id) { if (fsck_err_on(!bch2_snapshot_equiv(c, *id), c, snapshot_node_missing, - "snapshot node %u from tree %s missing", *id, buf.buf)) { + "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { if (t->nr > 1) { bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); ret = -BCH_ERR_fsck_repair_unimplemented; @@ -1090,7 +1091,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) int ret = 0; s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_INTENT, snapshot); + BTREE_ITER_intent, snapshot); ret = bkey_err(s); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "missing snapshot %u", id); @@ -1199,7 +1200,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, - POS_MIN, BTREE_ITER_INTENT); + POS_MIN, BTREE_ITER_intent); k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) @@ -1367,7 +1368,7 @@ static int snapshot_delete_key(struct btree_trans *trans, if (snapshot_list_has_id(deleted, k.k->p.snapshot) || snapshot_list_has_id(equiv_seen, equiv)) { return bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); } else { return snapshot_list_add(c, equiv_seen, equiv); } @@ -1404,15 +1405,15 @@ static int move_key_to_correct_snapshot(struct btree_trans *trans, 
new->k.p.snapshot = equiv; bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); + BTREE_ITER_all_snapshots| + BTREE_ITER_cached| + BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&new_iter) ?: bch2_trans_update(trans, &new_iter, new, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + BTREE_UPDATE_internal_snapshot_node) ?: bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &new_iter); if (ret) return ret; @@ -1603,12 +1604,12 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: for_each_btree_key_commit(trans, iter, id, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, move_key_to_correct_snapshot(trans, &iter, k)); @@ -1643,7 +1644,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) * nodes some depth fields will be off: */ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, - BTREE_ITER_INTENT, k, + BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); if (ret) @@ -1699,8 +1700,8 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, id, pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); while (1) { k = bch2_btree_iter_prev(&iter); ret = bkey_err(k); @@ -1752,7 +1753,7 @@ static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, pos.snapshot = leaf_id; - bch2_trans_iter_init(trans, &iter, 
btree, pos, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index b7d2fed37c4f..bd5d74269d15 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -2,11 +2,11 @@ #ifndef _BCACHEFS_SNAPSHOT_H #define _BCACHEFS_SNAPSHOT_H -enum bkey_invalid_flags; +enum bch_validate_flags; void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); #define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ .key_invalid = bch2_snapshot_tree_invalid, \ @@ -20,9 +20,10 @@ int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tre void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ .key_invalid = bch2_snapshot_invalid, \ @@ -77,7 +78,7 @@ static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) return 0; u32 parent = s->parent; - if (IS_ENABLED(CONFIG_BCACHEFS_DEBU) && + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && parent && s->depth != snapshot_t(c, parent)->depth + 1) panic("id %u depth=%u parent %u depth=%u\n", @@ -135,11 +136,6 @@ static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) return id; } -static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) -{ - return id == bch2_snapshot_equiv(c, id); -} - static inline int 
bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { rcu_read_lock(); diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 3976f80721bf..cbad9b27874f 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -15,16 +15,6 @@ #include <crypto/hash.h> #include <crypto/sha2.h> -typedef unsigned __bitwise bch_str_hash_flags_t; - -enum bch_str_hash_flags { - __BCH_HASH_SET_MUST_CREATE, - __BCH_HASH_SET_MUST_REPLACE, -}; - -#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE) -#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE) - static inline enum bch_str_hash_type bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) { @@ -159,13 +149,14 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s desc.is_visible(inum, k)); } -static __always_inline int +static __always_inline struct bkey_s_c bch2_hash_lookup_in_snapshot(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, const void *key, - unsigned flags, u32 snapshot) + enum btree_iter_update_trigger_flags flags, + u32 snapshot) { struct bkey_s_c k; int ret; @@ -173,10 +164,10 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans, for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), - BTREE_ITER_SLOTS|flags, k, ret) { + BTREE_ITER_slots|flags, k, ret) { if (is_visible_key(desc, inum, k)) { if (!desc.cmp_key(k, key)) - return 0; + return k; } else if (k.k->type == KEY_TYPE_hash_whiteout) { ; } else { @@ -186,20 +177,23 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans, } bch2_trans_iter_exit(trans, iter); - return ret ?: -BCH_ERR_ENOENT_str_hash_lookup; + return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup); } -static __always_inline int +static __always_inline struct 
bkey_s_c bch2_hash_lookup(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, const void *key, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { u32 snapshot; - return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: - bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot); + int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return bkey_s_c_err(ret); + + return bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot); } static __always_inline int @@ -220,7 +214,7 @@ bch2_hash_hole(struct btree_trans *trans, for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) + BTREE_ITER_slots|BTREE_ITER_intent, k, ret) if (!is_visible_key(desc, inum, k)) return 0; bch2_trans_iter_exit(trans, iter); @@ -242,7 +236,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, bch2_btree_iter_advance(&iter); - for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) { + for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) { if (k.k->type != desc.key_type && k.k->type != KEY_TYPE_hash_whiteout) break; @@ -264,8 +258,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, const struct bch_hash_info *info, subvol_inum inum, u32 snapshot, struct bkey_i *insert, - bch_str_hash_flags_t str_hash_flags, - int update_flags) + enum btree_iter_update_trigger_flags flags) { struct btree_iter iter, slot = { NULL }; struct bkey_s_c k; @@ -277,7 +270,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), POS(insert->k.p.inode, U64_MAX), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { if (is_visible_key(desc, inum, k)) { if 
(!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) goto found; @@ -286,8 +279,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, continue; } - if (!slot.path && - !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) + if (!slot.path && !(flags & STR_HASH_must_replace)) bch2_trans_copy_iter(&slot, &iter); if (k.k->type != KEY_TYPE_hash_whiteout) @@ -305,16 +297,16 @@ found: found = true; not_found: - if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) { + if (!found && (flags & STR_HASH_must_replace)) { ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; - } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) { + } else if (found && (flags & STR_HASH_must_create)) { ret = -EEXIST; } else { if (!found && slot.path) swap(iter, slot); insert->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, insert, update_flags); + ret = bch2_trans_update(trans, &iter, insert, flags); } goto out; @@ -326,14 +318,14 @@ int bch2_hash_set(struct btree_trans *trans, const struct bch_hash_info *info, subvol_inum inum, struct bkey_i *insert, - bch_str_hash_flags_t str_hash_flags) + enum btree_iter_update_trigger_flags flags) { insert->k.p.inode = inum.inum; u32 snapshot; return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: bch2_hash_set_in_snapshot(trans, desc, info, inum, - snapshot, insert, str_hash_flags, 0); + snapshot, insert, flags); } static __always_inline @@ -341,7 +333,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, struct btree_iter *iter, - unsigned update_flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_i *delete; int ret; @@ -359,7 +351,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, delete->k.p = iter->pos; delete->k.type = ret ? 
KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; - return bch2_trans_update(trans, iter, delete, update_flags); + return bch2_trans_update(trans, iter, delete, flags); } static __always_inline @@ -369,14 +361,10 @@ int bch2_hash_delete(struct btree_trans *trans, subvol_inum inum, const void *key) { struct btree_iter iter; - int ret; - - ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key, - BTREE_ITER_INTENT); - if (ret) - return ret; - - ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); + struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key, + BTREE_ITER_intent); + int ret = bkey_err(k) ?: + bch2_hash_delete_at(trans, desc, info, &iter, 0); bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 88a79c823276..132213761ef6 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -162,7 +162,7 @@ int bch2_check_subvols(struct bch_fs *c) { int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_subvol(trans, &iter, k))); bch_err_fn(c, ret); @@ -198,7 +198,7 @@ int bch2_check_subvol_children(struct bch_fs *c) { int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_PREFETCH, k, + BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_subvol_child(trans, &iter, k))); bch_err_fn(c, ret); @@ -208,7 +208,7 @@ int bch2_check_subvol_children(struct bch_fs *c) /* Subvolumes: */ int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -245,9 +245,9 @@ static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bo int 
bch2_subvolume_trigger(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { struct bpos children_pos_old = subvolume_children_pos(old); struct bpos children_pos_new = subvolume_children_pos(new.s_c); @@ -333,7 +333,7 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, subvol = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES, + BTREE_ITER_cached|BTREE_ITER_with_updates, subvolume); ret = bkey_err(subvol); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, @@ -383,9 +383,9 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d return lockrestart_do(trans, bch2_subvolume_get(trans, subvolid_to_delete, true, - BTREE_ITER_CACHED, &s)) ?: + BTREE_ITER_cached, &s)) ?: for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_subvolume_reparent(trans, &iter, k, subvolid_to_delete, le32_to_cpu(s.creation_parent))); @@ -404,7 +404,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) subvol = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_CACHED|BTREE_ITER_INTENT, + BTREE_ITER_cached|BTREE_ITER_intent, subvolume); ret = bkey_err(subvol); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, @@ -505,7 +505,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) n = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_CACHED, subvolume); + BTREE_ITER_cached, subvolume); ret = PTR_ERR_OR_ZERO(n); if (unlikely(ret)) { bch2_fs_inconsistent_on(bch2_err_matches(ret, 
ENOENT), trans->c, @@ -547,7 +547,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter, BTREE_ID_subvolumes, POS(0, src_subvolid), - BTREE_ITER_CACHED, subvolume); + BTREE_ITER_cached, subvolume); ret = PTR_ERR_OR_ZERO(src_subvol); if (unlikely(ret)) { bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index d2015d549bd2..afa5e871efb2 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -5,16 +5,17 @@ #include "darray.h" #include "subvolume_types.h" -enum bkey_invalid_flags; +enum bch_validate_flags; int bch2_check_subvols(struct bch_fs *); int bch2_check_subvol_children(struct bch_fs *); int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ .key_invalid = bch2_subvolume_invalid, \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index bfdb15e7d778..f1bee6c5222d 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -76,7 +76,7 @@ const char * const bch2_sb_fields[] = { }; static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, - struct printbuf *); + enum bch_validate_flags, struct printbuf *); struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb, enum bch_sb_field_type type) @@ -344,8 +344,8 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) return 0; } -static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, - int rw) +static int bch2_sb_validate(struct bch_sb_handle *disk_sb, + enum 
bch_validate_flags flags, struct printbuf *out) { struct bch_sb *sb = disk_sb->sb; struct bch_sb_field_members_v1 *mi; @@ -401,7 +401,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, return -BCH_ERR_invalid_sb_time_precision; } - if (rw == READ) { + if (!flags) { /* * Been seeing a bug where these are getting inexplicably * zeroed, so we're now validating them, but we have to be @@ -457,7 +457,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, return -BCH_ERR_invalid_sb_members_missing; } - ret = bch2_sb_field_validate(sb, &mi->field, out); + ret = bch2_sb_field_validate(sb, &mi->field, flags, out); if (ret) return ret; @@ -465,12 +465,12 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1) continue; - ret = bch2_sb_field_validate(sb, f, out); + ret = bch2_sb_field_validate(sb, f, flags, out); if (ret) return ret; } - if (rw == WRITE && + if ((flags & BCH_VALIDATE_write) && bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) { prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu", le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq), @@ -819,7 +819,7 @@ got_super: sb->have_layout = true; - ret = bch2_sb_validate(sb, &err, READ); + ret = bch2_sb_validate(sb, 0, &err); if (ret) { bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", path, err.buf); @@ -975,7 +975,7 @@ int bch2_write_super(struct bch_fs *c) darray_for_each(online_devices, ca) { printbuf_reset(&err); - ret = bch2_sb_validate(&(*ca)->disk_sb, &err, WRITE); + ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err); if (ret) { bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); goto out; @@ -1020,26 +1020,35 @@ int bch2_write_super(struct bch_fs *c) continue; if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { - bch2_fs_fatal_error(c, + struct printbuf buf = PRINTBUF; + 
prt_char(&buf, ' '); + prt_bdevname(&buf, ca->disk_sb.bdev); + prt_printf(&buf, ": Superblock write was silently dropped! (seq %llu expected %llu)", le64_to_cpu(ca->sb_read_scratch->seq), ca->disk_sb.seq); - percpu_ref_put(&ca->io_ref); + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); ret = -BCH_ERR_erofs_sb_err; - goto out; } if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { - bch2_fs_fatal_error(c, + struct printbuf buf = PRINTBUF; + prt_char(&buf, ' '); + prt_bdevname(&buf, ca->disk_sb.bdev); + prt_printf(&buf, ": Superblock modified by another process (seq %llu expected %llu)", le64_to_cpu(ca->sb_read_scratch->seq), ca->disk_sb.seq); - percpu_ref_put(&ca->io_ref); + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); ret = -BCH_ERR_erofs_sb_err; - goto out; } } + if (ret) + goto out; + do { wrote = false; darray_for_each(online_devices, cap) { @@ -1152,7 +1161,7 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) } static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { if (vstruct_bytes(f) < 88) { prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88); @@ -1167,8 +1176,7 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_ext *e = field_to_type(f, ext); - prt_printf(out, "Recovery passes required:"); - prt_tab(out); + prt_printf(out, "Recovery passes required:\t"); prt_bitflags(out, bch2_recovery_passes, bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0]))); prt_newline(out); @@ -1177,16 +1185,14 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, if (errors_silent) { le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8); - prt_printf(out, "Errors to silently fix:"); - prt_tab(out); + prt_printf(out, "Errors to silently fix:\t"); prt_bitflags_vector(out, 
bch2_sb_error_strs, errors_silent, sizeof(e->errors_silent) * 8); prt_newline(out); kfree(errors_silent); } - prt_printf(out, "Btrees with missing data:"); - prt_tab(out); + prt_printf(out, "Btrees with missing data:\t"); prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data)); prt_newline(out); } @@ -1213,14 +1219,14 @@ static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type) } static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { unsigned type = le32_to_cpu(f->type); struct printbuf field_err = PRINTBUF; const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); int ret; - ret = ops->validate ? ops->validate(sb, f, &field_err) : 0; + ret = ops->validate ? ops->validate(sb, f, flags, &field_err) : 0; if (ret) { prt_printf(err, "Invalid superblock section %s: %s", bch2_sb_fields[type], field_err.buf); @@ -1294,97 +1300,73 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, printbuf_tabstop_push(out, 44); for (int i = 0; i < sb->nr_devices; i++) - nr_devices += bch2_dev_exists(sb, i); + nr_devices += bch2_member_exists(sb, i); - prt_printf(out, "External UUID:"); - prt_tab(out); + prt_printf(out, "External UUID:\t"); pr_uuid(out, sb->user_uuid.b); prt_newline(out); - prt_printf(out, "Internal UUID:"); - prt_tab(out); + prt_printf(out, "Internal UUID:\t"); pr_uuid(out, sb->uuid.b); prt_newline(out); - prt_printf(out, "Magic number:"); - prt_tab(out); + prt_printf(out, "Magic number:\t"); pr_uuid(out, sb->magic.b); prt_newline(out); - prt_str(out, "Device index:"); - prt_tab(out); - prt_printf(out, "%u", sb->dev_idx); - prt_newline(out); + prt_printf(out, "Device index:\t%u\n", sb->dev_idx); - prt_str(out, "Label:"); - prt_tab(out); + prt_str(out, "Label:\t"); prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); prt_newline(out); - prt_str(out, "Version:"); - prt_tab(out); + prt_str(out, "Version:\t"); 
bch2_version_to_text(out, le16_to_cpu(sb->version)); prt_newline(out); - prt_str(out, "Version upgrade complete:"); - prt_tab(out); + prt_str(out, "Version upgrade complete:\t"); bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); prt_newline(out); - prt_printf(out, "Oldest version on disk:"); - prt_tab(out); + prt_printf(out, "Oldest version on disk:\t"); bch2_version_to_text(out, le16_to_cpu(sb->version_min)); prt_newline(out); - prt_printf(out, "Created:"); - prt_tab(out); + prt_printf(out, "Created:\t"); if (sb->time_base_lo) bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); else prt_printf(out, "(not set)"); prt_newline(out); - prt_printf(out, "Sequence number:"); - prt_tab(out); + prt_printf(out, "Sequence number:\t"); prt_printf(out, "%llu", le64_to_cpu(sb->seq)); prt_newline(out); - prt_printf(out, "Time of last write:"); - prt_tab(out); + prt_printf(out, "Time of last write:\t"); bch2_prt_datetime(out, le64_to_cpu(sb->write_time)); prt_newline(out); - prt_printf(out, "Superblock size:"); - prt_tab(out); + prt_printf(out, "Superblock size:\t"); prt_units_u64(out, vstruct_bytes(sb)); prt_str(out, "/"); prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits); prt_newline(out); - prt_printf(out, "Clean:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_SB_CLEAN(sb)); - prt_newline(out); - - prt_printf(out, "Devices:"); - prt_tab(out); - prt_printf(out, "%u", nr_devices); - prt_newline(out); + prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb)); + prt_printf(out, "Devices:\t%u\n", nr_devices); - prt_printf(out, "Sections:"); + prt_printf(out, "Sections:\t"); vstruct_for_each(sb, f) fields_have |= 1 << le32_to_cpu(f->type); - prt_tab(out); prt_bitflags(out, bch2_sb_fields, fields_have); prt_newline(out); - prt_printf(out, "Features:"); - prt_tab(out); + prt_printf(out, "Features:\t"); prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0])); prt_newline(out); - prt_printf(out, "Compat features:"); - prt_tab(out); 
+ prt_printf(out, "Compat features:\t"); prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); prt_newline(out); @@ -1401,8 +1383,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, if (opt->get_sb != BCH2_NO_SB_OPT) { u64 v = bch2_opt_from_sb(sb, id); - prt_printf(out, "%s:", opt->attr.name); - prt_tab(out); + prt_printf(out, "%s:\t", opt->attr.name); bch2_opt_to_text(out, NULL, sb, opt, v, OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); prt_newline(out); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 95e80e06316b..fadd364e2802 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -51,7 +51,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); extern const char * const bch2_sb_fields[]; struct bch_sb_field_ops { - int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *); + int (*validate)(struct bch_sb *, struct bch_sb_field *, + enum bch_validate_flags, struct printbuf *); void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); }; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index dddf57ec4511..294a9d35a9f2 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -264,7 +264,6 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_open_buckets_stop(c, NULL, true); bch2_rebalance_stop(c); bch2_copygc_stop(c); - bch2_gc_thread_stop(c); bch2_fs_ec_flush(c); bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", @@ -285,7 +284,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu", journal_cur_seq(&c->journal)); - if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && + if (test_bit(JOURNAL_replay_done, &c->journal.flags) && !test_bit(BCH_FS_emergency_ro, &c->flags)) set_bit(BCH_FS_clean_shutdown, &c->flags); @@ -467,7 +466,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) * overwriting whatever was there 
previously, and there must always be * at least one non-flush write in the journal or recovery will fail: */ - set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags); + set_bit(JOURNAL_need_flush_write, &c->journal.flags); + set_bit(JOURNAL_running, &c->journal.flags); for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); @@ -485,12 +485,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) } #endif - ret = bch2_gc_thread_start(c); - if (ret) { - bch_err(c, "error starting gc thread"); - return ret; - } - ret = bch2_journal_reclaim_start(&c->journal); if (ret) goto err; @@ -537,9 +531,7 @@ int bch2_fs_read_write_early(struct bch_fs *c) static void __bch2_fs_free(struct bch_fs *c) { - unsigned i; - - for (i = 0; i < BCH_TIME_STAT_NR; i++) + for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); bch2_find_btree_nodes_exit(&c->found_btree_nodes); @@ -572,6 +564,7 @@ static void __bch2_fs_free(struct bch_fs *c) BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); + EBUG_ON(percpu_u64_get(c->online_reserved)); free_percpu(c->online_reserved); darray_exit(&c->btree_roots_extra); @@ -616,8 +609,6 @@ void __bch2_fs_stop(struct bch_fs *c) set_bit(BCH_FS_stopping, &c->flags); - cancel_work_sync(&c->journal_seq_blacklist_gc_work); - down_write(&c->state_lock); bch2_fs_read_only(c); up_write(&c->state_lock); @@ -665,6 +656,7 @@ void bch2_fs_free(struct bch_fs *c) struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); if (ca) { + EBUG_ON(atomic_long_read(&ca->ref) != 1); bch2_free_super(&ca->disk_sb); bch2_dev_free(ca); } @@ -719,7 +711,7 @@ static int bch2_fs_online(struct bch_fs *c) ret = bch2_dev_sysfs_online(c, ca); if (ret) { bch_err(c, "error creating sysfs objects"); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); goto err; } } @@ -778,6 +770,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) for (i = 0; i < BCH_TIME_STAT_NR; 
i++) bch2_time_stats_init(&c->times[i]); + bch2_fs_gc_init(c); bch2_fs_copygc_init(c); bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); bch2_fs_btree_iter_init_early(c); @@ -800,16 +793,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) spin_lock_init(&c->btree_write_error_lock); - INIT_WORK(&c->journal_seq_blacklist_gc_work, - bch2_blacklist_entries_gc); - INIT_LIST_HEAD(&c->journal_iters); INIT_LIST_HEAD(&c->fsck_error_msgs); mutex_init(&c->fsck_error_msgs_lock); - seqcount_init(&c->gc_pos_lock); - seqcount_init(&c->usage_lock); sema_init(&c->io_in_flight, 128); @@ -940,7 +928,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; for (i = 0; i < c->sb.nr_devices; i++) - if (bch2_dev_exists(c->disk_sb.sb, i) && + if (bch2_member_exists(c->disk_sb.sb, i) && bch2_dev_alloc(c, i)) { ret = -EEXIST; goto err; @@ -1101,7 +1089,7 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid)) return -BCH_ERR_device_not_a_member_of_filesystem; - if (!bch2_dev_exists(fs->sb, sb->sb->dev_idx)) + if (!bch2_member_exists(fs->sb, sb->sb->dev_idx)) return -BCH_ERR_device_has_been_removed; if (fs->sb->block_size != sb->sb->block_size) @@ -1200,11 +1188,11 @@ static void bch2_dev_free(struct bch_dev *ca) if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); + kfree(ca->buckets_nouse); bch2_free_super(&ca->disk_sb); bch2_dev_journal_exit(ca); free_percpu(ca->io_done); - bioset_exit(&ca->replica_set); bch2_dev_buckets_free(ca); free_page((unsigned long) ca->sb_read_scratch); @@ -1212,7 +1200,9 @@ static void bch2_dev_free(struct bch_dev *ca) bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); percpu_ref_exit(&ca->io_ref); +#ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_exit(&ca->ref); +#endif kobject_put(&ca->kobj); } @@ -1239,12 +1229,14 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) bch2_dev_journal_exit(ca); } +#ifndef CONFIG_BCACHEFS_DEBUG 
static void bch2_dev_ref_complete(struct percpu_ref *ref) { struct bch_dev *ca = container_of(ref, struct bch_dev, ref); complete(&ca->ref_completion); } +#endif static void bch2_dev_io_ref_complete(struct percpu_ref *ref) { @@ -1313,14 +1305,17 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ca->mi.bucket_size / btree_sectors(c)); - if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, - 0, GFP_KERNEL) || - percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, +#ifndef CONFIG_BCACHEFS_DEBUG + if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL)) + goto err; +#else + atomic_long_set(&ca->ref, 1); +#endif + + if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || - bioset_init(&ca->replica_set, 4, - offsetof(struct bch_write_bio, bio), 0) || !(ca->io_done = alloc_percpu(*ca->io_done))) goto err; @@ -1411,10 +1406,9 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) le64_to_cpu(c->disk_sb.sb->seq)) bch2_sb_to_fs(c, sb->sb); - BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || - !c->devs[sb->sb->dev_idx]); + BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx)); - ca = bch_dev_locked(c, sb->sb->dev_idx); + ca = bch2_dev_locked(c, sb->sb->dev_idx); ret = __bch2_dev_attach_bdev(ca, sb); if (ret) @@ -1506,10 +1500,10 @@ static bool bch2_fs_may_start(struct bch_fs *c) mutex_lock(&c->sb_lock); for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { - if (!bch2_dev_exists(c->disk_sb.sb, i)) + if (!bch2_member_exists(c->disk_sb.sb, i)) continue; - ca = bch_dev_locked(c, i); + ca = bch2_dev_locked(c, i); if (!bch2_dev_is_online(ca) && (ca->mi.state == BCH_MEMBER_STATE_rw || @@ -1599,17 +1593,17 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) * with bch2_do_invalidates() and bch2_do_discards() */ ret = 
bch2_btree_delete_range(c, BTREE_ID_lru, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: + BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: + BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: + BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: + BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: + BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, - BTREE_TRIGGER_NORUN, NULL); + BTREE_TRIGGER_norun, NULL); bch_err_msg(c, ret, "removing dev alloc info"); return ret; } @@ -1626,7 +1620,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) * We consume a reference to ca->ref, regardless of whether we succeed * or fail: */ - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot remove without losing data"); @@ -1678,7 +1672,12 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) rcu_assign_pointer(c->devs[ca->dev_idx], NULL); mutex_unlock(&c->sb_lock); +#ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_kill(&ca->ref); +#else + ca->dying = true; + bch2_dev_put(ca); +#endif wait_for_completion(&ca->ref_completion); bch2_dev_free(ca); @@ -1777,9 +1776,28 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (dynamic_fault("bcachefs:add:no_slot")) goto no_slot; - for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) - if (!bch2_dev_exists(c->disk_sb.sb, dev_idx)) - goto have_slot; + if (c->sb.nr_devices < BCH_SB_MEMBERS_MAX) { + dev_idx = c->sb.nr_devices; + goto have_slot; + } + + int best = -1; + u64 best_last_mount = 0; + for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) { + struct bch_member m = 
bch2_sb_member_get(c->disk_sb.sb, dev_idx); + if (bch2_member_alive(&m)) + continue; + + u64 last_mount = le64_to_cpu(m.last_mount); + if (best < 0 || last_mount < best_last_mount) { + best = dev_idx; + best_last_mount = last_mount; + } + } + if (best >= 0) { + dev_idx = best; + goto have_slot; + } no_slot: ret = -BCH_ERR_ENOSPC_sb_members; bch_err_msg(c, ret, "setting up new superblock"); @@ -1821,7 +1839,7 @@ have_slot: bch2_dev_usage_journal_reserve(c); - ret = bch2_trans_mark_dev_sb(c, ca); + ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); bch_err_msg(ca, ret, "marking new superblock"); if (ret) goto err_late; @@ -1884,9 +1902,9 @@ int bch2_dev_online(struct bch_fs *c, const char *path) if (ret) goto err; - ca = bch_dev_locked(c, dev_idx); + ca = bch2_dev_locked(c, dev_idx); - ret = bch2_trans_mark_dev_sb(c, ca); + ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); if (ret) goto err; @@ -1979,7 +1997,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (ret) goto err; - ret = bch2_trans_mark_dev_sb(c, ca); + ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); if (ret) goto err; diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 11bcef170c2c..368a63d938cf 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -26,19 +26,4 @@ struct bch_devs_list { u8 data[BCH_BKEY_PTRS_MAX]; }; -struct bch_member_cpu { - u64 nbuckets; /* device size */ - u16 first_bucket; /* index of first bucket used */ - u16 bucket_size; /* sectors */ - u16 group; - u8 state; - u8 discard; - u8 data_allowed; - u8 durability; - u8 freespace_initialized; - u8 valid; - u8 btree_bitmap_shift; - u64 btree_allocated_bitmap; -}; - #endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 5be92fe3f4ea..93ca74d108b1 100644 --- a/fs/bcachefs/sysfs.c +++ 
b/fs/bcachefs/sysfs.c @@ -140,9 +140,8 @@ write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); write_attribute(trigger_journal_flush); -write_attribute(prune_cache); -write_attribute(btree_wakeup); -rw_attribute(btree_gc_periodic); +write_attribute(trigger_btree_cache_shrink); +write_attribute(trigger_btree_key_cache_shrink); rw_attribute(gc_gens_pos); read_attribute(uuid); @@ -189,12 +188,8 @@ static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c) { bch2_printbuf_tabstop_push(out, 24); - for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) { - prt_str(out, bch2_write_refs[i]); - prt_tab(out); - prt_printf(out, "%li", atomic_long_read(&c->writes[i])); - prt_newline(out); - } + for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) + prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i])); } #endif @@ -278,7 +273,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c continue; ret = for_each_btree_key(trans, iter, id, POS_MIN, - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_all_snapshots, k, ({ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_extent_crc_unpacked crc; const union bch_extent_entry *entry; @@ -313,22 +308,11 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c if (ret) return ret; - prt_str(out, "type"); printbuf_tabstop_push(out, 12); - prt_tab(out); - - prt_str(out, "compressed"); printbuf_tabstop_push(out, 16); - prt_tab_rjust(out); - - prt_str(out, "uncompressed"); printbuf_tabstop_push(out, 16); - prt_tab_rjust(out); - - prt_str(out, "average extent size"); printbuf_tabstop_push(out, 24); - prt_tab_rjust(out); - prt_newline(out); + prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n"); for (unsigned i = 0; i < ARRAY_SIZE(s); i++) { bch2_prt_compression_type(out, i); @@ -362,21 +346,6 @@ static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) 
prt_printf(out, "\n"); } -static void bch2_btree_wakeup_all(struct bch_fs *c) -{ - struct btree_trans *trans; - - seqmutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - struct btree_bkey_cached_common *b = READ_ONCE(trans->locking); - - if (b) - six_lock_wakeup_all(&b->lock); - - } - seqmutex_unlock(&c->btree_trans_lock); -} - SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -392,8 +361,6 @@ SHOW(bch2_fs) if (attr == &sysfs_btree_write_stats) bch2_btree_write_stats_to_text(out, c); - sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); - if (attr == &sysfs_gc_gens_pos) bch2_gc_gens_pos_to_text(out, c); @@ -416,7 +383,7 @@ SHOW(bch2_fs) bch2_journal_debug_to_text(out, &c->journal); if (attr == &sysfs_btree_cache) - bch2_btree_cache_to_text(out, c); + bch2_btree_cache_to_text(out, &c->btree_cache); if (attr == &sysfs_btree_key_cache) bch2_btree_key_cache_to_text(out, &c->btree_key_cache); @@ -459,6 +426,9 @@ SHOW(bch2_fs) if (attr == &sysfs_disk_groups) bch2_disk_groups_to_text(out, c); + if (attr == &sysfs_alloc_debug) + bch2_fs_alloc_debug_to_text(out, c); + return 0; } @@ -466,14 +436,6 @@ STORE(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - if (attr == &sysfs_btree_gc_periodic) { - ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) - ?: (ssize_t) size; - - wake_up_process(c->gc_thread); - return ret; - } - if (attr == &sysfs_copy_gc_enabled) { ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) ?: (ssize_t) size; @@ -505,7 +467,7 @@ STORE(bch2_fs) if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)) return -EROFS; - if (attr == &sysfs_prune_cache) { + if (attr == &sysfs_trigger_btree_cache_shrink) { struct shrink_control sc; sc.gfp_mask = GFP_KERNEL; @@ -513,22 +475,17 @@ STORE(bch2_fs) c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc); } - if (attr == &sysfs_btree_wakeup) - bch2_btree_wakeup_all(c); - - if (attr == &sysfs_trigger_gc) { 
- /* - * Full gc is currently incompatible with btree key cache: - */ -#if 0 - down_read(&c->state_lock); - bch2_gc(c, false, false); - up_read(&c->state_lock); -#else - bch2_gc_gens(c); -#endif + if (attr == &sysfs_trigger_btree_key_cache_shrink) { + struct shrink_control sc; + + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = strtoul_or_return(buf); + c->btree_key_cache.shrink->scan_objects(c->btree_cache.shrink, &sc); } + if (attr == &sysfs_trigger_gc) + bch2_gc_gens(c); + if (attr == &sysfs_trigger_discards) bch2_do_discards(c); @@ -594,13 +551,11 @@ SHOW(bch2_fs_counters) if (attr == &sysfs_##t) { \ counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ - prt_printf(out, "since mount:"); \ - prt_tab(out); \ + prt_printf(out, "since mount:\t"); \ prt_human_readable_u64(out, counter_since_mount); \ prt_newline(out); \ \ - prt_printf(out, "since filesystem creation:"); \ - prt_tab(out); \ + prt_printf(out, "since filesystem creation:\t"); \ prt_human_readable_u64(out, counter); \ prt_newline(out); \ } @@ -660,8 +615,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_discards, &sysfs_trigger_invalidates, &sysfs_trigger_journal_flush, - &sysfs_prune_cache, - &sysfs_btree_wakeup, + &sysfs_trigger_btree_cache_shrink, + &sysfs_trigger_btree_key_cache_shrink, &sysfs_gc_gens_pos, @@ -677,6 +632,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_internal_uuid, &sysfs_disk_groups, + &sysfs_alloc_debug, NULL }; @@ -792,88 +748,6 @@ struct attribute *bch2_fs_time_stats_files[] = { NULL }; -static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bch_dev_usage stats = bch2_dev_usage_read(ca); - unsigned i, nr[BCH_DATA_NR]; - - memset(nr, 0, sizeof(nr)); - - for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) - nr[c->open_buckets[i].data_type]++; - - printbuf_tabstop_push(out, 8); - printbuf_tabstop_push(out, 16); - 
printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - - bch2_dev_usage_to_text(out, &stats); - - prt_newline(out); - - prt_printf(out, "reserves:"); - prt_newline(out); - for (i = 0; i < BCH_WATERMARK_NR; i++) { - prt_str(out, bch2_watermarks[i]); - prt_tab(out); - prt_u64(out, bch2_dev_buckets_reserved(ca, i)); - prt_tab_rjust(out); - prt_newline(out); - } - - prt_newline(out); - - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 24); - - prt_str(out, "freelist_wait"); - prt_tab(out); - prt_str(out, c->freelist_wait.list.first ? "waiting" : "empty"); - prt_newline(out); - - prt_str(out, "open buckets allocated"); - prt_tab(out); - prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free); - prt_newline(out); - - prt_str(out, "open buckets this dev"); - prt_tab(out); - prt_u64(out, ca->nr_open_buckets); - prt_newline(out); - - prt_str(out, "open buckets total"); - prt_tab(out); - prt_u64(out, OPEN_BUCKETS_COUNT); - prt_newline(out); - - prt_str(out, "open_buckets_wait"); - prt_tab(out); - prt_str(out, c->open_buckets_wait.list.first ? 
"waiting" : "empty"); - prt_newline(out); - - prt_str(out, "open_buckets_btree"); - prt_tab(out); - prt_u64(out, nr[BCH_DATA_btree]); - prt_newline(out); - - prt_str(out, "open_buckets_user"); - prt_tab(out); - prt_u64(out, nr[BCH_DATA_user]); - prt_newline(out); - - prt_str(out, "buckets_to_invalidate"); - prt_tab(out); - prt_u64(out, should_invalidate_buckets(ca, stats)); - prt_newline(out); - - prt_str(out, "btree reserve cache"); - prt_tab(out); - prt_u64(out, c->btree_reserve_cache_nr); - prt_newline(out); -} - static const char * const bch2_rw[] = { "read", "write", @@ -943,7 +817,7 @@ SHOW(bch2_dev) * 100 / CONGESTED_MAX); if (attr == &sysfs_alloc_debug) - dev_alloc_debug_to_text(out, ca); + bch2_dev_alloc_debug_to_text(out, ca); return 0; } diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index bfec656f94c0..68104b2056d9 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -40,7 +40,7 @@ static int test_delete(struct bch_fs *c, u64 nr) k.k.p.snapshot = U32_MAX; bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: @@ -81,7 +81,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) k.k.p.snapshot = U32_MAX; bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: @@ -261,7 +261,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) ret = bch2_trans_run(c, for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_SLOTS, k, ({ + BTREE_ITER_slots, k, ({ if (i >= nr * 2) break; @@ -322,7 +322,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) ret = bch2_trans_run(c, for_each_btree_key_upto(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_SLOTS, k, ({ + BTREE_ITER_slots, k, ({ if (i == nr) break; 
BUG_ON(bkey_deleted(k.k) != !(i % 16)); @@ -452,7 +452,7 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, ret = bch2_trans_do(c, NULL, NULL, 0, bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + BTREE_UPDATE_internal_snapshot_node)); bch_err_fn(c, ret); return ret; } @@ -671,7 +671,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) int ret = 0; bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, - BTREE_ITER_INTENT); + BTREE_ITER_intent); k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)); ret = bkey_err(k); if (ret) @@ -714,7 +714,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) return bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, + BTREE_ITER_slots|BTREE_ITER_intent, k, NULL, NULL, 0, ({ if (iter.pos.offset >= nr) break; @@ -737,7 +737,7 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) return bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), - BTREE_ITER_INTENT, k, + BTREE_ITER_intent, k, NULL, NULL, 0, ({ struct bkey_i_cookie u; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 6aa81d1e6d36..84fcf26e306e 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -43,7 +43,7 @@ DECLARE_EVENT_CLASS(fs_str, TP_fast_assign( __entry->dev = c->dev; - __assign_str(str, str); + __assign_str(str); ), TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str)) @@ -64,7 +64,7 @@ DECLARE_EVENT_CLASS(trans_str, __entry->dev = trans->c->dev; strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; - __assign_str(str, str); + __assign_str(str); ), TP_printk("%d,%d %s %pS %s", @@ -85,7 +85,7 @@ DECLARE_EVENT_CLASS(trans_str_nocaller, TP_fast_assign( __entry->dev = trans->c->dev; strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - 
__assign_str(str, str); + __assign_str(str); ), TP_printk("%d,%d %s %s", @@ -638,99 +638,14 @@ DEFINE_EVENT(bch_fs, gc_gens_end, /* Allocator */ -DECLARE_EVENT_CLASS(bucket_alloc, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - u64 bucket, - u64 free, - u64 avail, - u64 copygc_wait_amount, - s64 copygc_waiting_for, - struct bucket_alloc_state *s, - bool nonblocking, - const char *err), - TP_ARGS(ca, alloc_reserve, bucket, free, avail, - copygc_wait_amount, copygc_waiting_for, - s, nonblocking, err), - - TP_STRUCT__entry( - __field(u8, dev ) - __array(char, reserve, 16 ) - __field(u64, bucket ) - __field(u64, free ) - __field(u64, avail ) - __field(u64, copygc_wait_amount ) - __field(s64, copygc_waiting_for ) - __field(u64, seen ) - __field(u64, open ) - __field(u64, need_journal_commit ) - __field(u64, nouse ) - __field(bool, nonblocking ) - __field(u64, nocow ) - __array(char, err, 32 ) - ), - - TP_fast_assign( - __entry->dev = ca->dev_idx; - strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); - __entry->bucket = bucket; - __entry->free = free; - __entry->avail = avail; - __entry->copygc_wait_amount = copygc_wait_amount; - __entry->copygc_waiting_for = copygc_waiting_for; - __entry->seen = s->buckets_seen; - __entry->open = s->skipped_open; - __entry->need_journal_commit = s->skipped_need_journal_commit; - __entry->nouse = s->skipped_nouse; - __entry->nonblocking = nonblocking; - __entry->nocow = s->skipped_nocow; - strscpy(__entry->err, err, sizeof(__entry->err)); - ), - - TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", - __entry->reserve, - __entry->dev, - __entry->bucket, - __entry->free, - __entry->avail, - __entry->copygc_wait_amount, - __entry->copygc_waiting_for, - __entry->seen, - __entry->open, - __entry->need_journal_commit, - __entry->nouse, - __entry->nocow, - __entry->nonblocking, - __entry->err) 
+DEFINE_EVENT(fs_str, bucket_alloc, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(bucket_alloc, bucket_alloc, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - u64 bucket, - u64 free, - u64 avail, - u64 copygc_wait_amount, - s64 copygc_waiting_for, - struct bucket_alloc_state *s, - bool nonblocking, - const char *err), - TP_ARGS(ca, alloc_reserve, bucket, free, avail, - copygc_wait_amount, copygc_waiting_for, - s, nonblocking, err) -); - -DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - u64 bucket, - u64 free, - u64 avail, - u64 copygc_wait_amount, - s64 copygc_waiting_for, - struct bucket_alloc_state *s, - bool nonblocking, - const char *err), - TP_ARGS(ca, alloc_reserve, bucket, free, avail, - copygc_wait_amount, copygc_waiting_for, - s, nonblocking, err) +DEFINE_EVENT(fs_str, bucket_alloc_fail, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); TRACE_EVENT(discard_buckets, diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 92c6ad75e702..de331dec2a99 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -348,15 +348,12 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) { const struct time_unit *u = bch2_pick_time_units(ns); - prt_printf(out, "%llu ", div64_u64(ns, u->nsecs)); - prt_tab_rjust(out); - prt_printf(out, "%s", u->name); + prt_printf(out, "%llu \r%s", div64_u64(ns, u->nsecs), u->name); } static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) { - prt_str(out, name); - prt_tab(out); + prt_printf(out, "%s\t", name); bch2_pr_time_units_aligned(out, ns); prt_newline(out); } @@ -389,12 +386,8 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats } printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE); - prt_printf(out, "count:"); - prt_tab(out); - prt_printf(out, "%llu ", - stats->duration_stats.n); + prt_printf(out, "count:\t%llu\n", 
stats->duration_stats.n); printbuf_tabstop_pop(out); - prt_newline(out); printbuf_tabstops_reset(out); @@ -403,13 +396,8 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats printbuf_tabstop_push(out, 0); printbuf_tabstop_push(out, TABSTOP_SIZE + 2); - prt_tab(out); - prt_printf(out, "since mount"); - prt_tab_rjust(out); - prt_tab(out); + prt_printf(out, "\tsince mount\r\trecent\r\n"); prt_printf(out, "recent"); - prt_tab_rjust(out); - prt_newline(out); printbuf_tabstops_reset(out); printbuf_tabstop_push(out, out->indent + 20); @@ -417,23 +405,20 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats printbuf_tabstop_push(out, 2); printbuf_tabstop_push(out, TABSTOP_SIZE); - prt_printf(out, "duration of events"); - prt_newline(out); + prt_printf(out, "duration of events\n"); printbuf_indent_add(out, 2); pr_name_and_units(out, "min:", stats->min_duration); pr_name_and_units(out, "max:", stats->max_duration); pr_name_and_units(out, "total:", stats->total_duration); - prt_printf(out, "mean:"); - prt_tab(out); + prt_printf(out, "mean:\t"); bch2_pr_time_units_aligned(out, d_mean); prt_tab(out); bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); prt_newline(out); - prt_printf(out, "stddev:"); - prt_tab(out); + prt_printf(out, "stddev:\t"); bch2_pr_time_units_aligned(out, d_stddev); prt_tab(out); bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); @@ -441,22 +426,19 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats printbuf_indent_sub(out, 2); prt_newline(out); - prt_printf(out, "time between events"); - prt_newline(out); + prt_printf(out, "time between events\n"); printbuf_indent_add(out, 2); pr_name_and_units(out, "min:", stats->min_freq); pr_name_and_units(out, "max:", stats->max_freq); - prt_printf(out, "mean:"); - prt_tab(out); + 
prt_printf(out, "mean:\t"); bch2_pr_time_units_aligned(out, f_mean); prt_tab(out); bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); prt_newline(out); - prt_printf(out, "stddev:"); - prt_tab(out); + prt_printf(out, "stddev:\t"); bch2_pr_time_units_aligned(out, f_stddev); prt_tab(out); bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); @@ -589,40 +571,31 @@ void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_contro if (!out->nr_tabstops) printbuf_tabstop_push(out, 20); - prt_printf(out, "rate:"); - prt_tab(out); + prt_printf(out, "rate:\t"); prt_human_readable_s64(out, pd->rate.rate); prt_newline(out); - prt_printf(out, "target:"); - prt_tab(out); + prt_printf(out, "target:\t"); prt_human_readable_u64(out, pd->last_target); prt_newline(out); - prt_printf(out, "actual:"); - prt_tab(out); + prt_printf(out, "actual:\t"); prt_human_readable_u64(out, pd->last_actual); prt_newline(out); - prt_printf(out, "proportional:"); - prt_tab(out); + prt_printf(out, "proportional:\t"); prt_human_readable_s64(out, pd->last_proportional); prt_newline(out); - prt_printf(out, "derivative:"); - prt_tab(out); + prt_printf(out, "derivative:\t"); prt_human_readable_s64(out, pd->last_derivative); prt_newline(out); - prt_printf(out, "change:"); - prt_tab(out); + prt_printf(out, "change:\t"); prt_human_readable_s64(out, pd->last_change); prt_newline(out); - prt_printf(out, "next io:"); - prt_tab(out); - prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); - prt_newline(out); + prt_printf(out, "next io:\t%llims\n", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); } /* misc: */ diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 5cf885b09986..5d2c470a49ac 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -445,11 +445,6 @@ static inline unsigned fract_exp_two(unsigned x, unsigned 
fract_bits) void bch2_bio_map(struct bio *bio, void *base, size_t); int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); -static inline sector_t bdev_sectors(struct block_device *bdev) -{ - return bdev->bd_inode->i_size >> 9; -} - #define closure_bio_submit(bio, cl) \ do { \ closure_get(cl); \ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 754f17bba68e..c11bf6dacc2c 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -71,7 +71,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { }; int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); @@ -118,11 +118,17 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, else prt_printf(out, "(unknown type %u)", xattr.v->x_type); + unsigned name_len = xattr.v->x_name_len; + unsigned val_len = le16_to_cpu(xattr.v->x_val_len); + unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) - + offsetof(struct bch_xattr, x_name); + + val_len = min_t(int, val_len, max_name_val_bytes - name_len); + name_len = min(name_len, max_name_val_bytes); + prt_printf(out, "%.*s:%.*s", - xattr.v->x_name_len, - xattr.v->x_name, - le16_to_cpu(xattr.v->x_val_len), - (char *) xattr_val(xattr.v)); + name_len, xattr.v->x_name, + val_len, (char *) xattr_val(xattr.v)); if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS || xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) { @@ -138,21 +144,13 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); struct btree_iter iter; - struct bkey_s_c_xattr xattr; - struct bkey_s_c k; - int ret; - - ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, - inode_inum(inode), &search, 0); + struct bkey_s_c k = bch2_hash_lookup(trans, &iter, 
bch2_xattr_hash_desc, &hash, + inode_inum(inode), &search, 0); + int ret = bkey_err(k); if (ret) - goto err1; - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err2; + return ret; - xattr = bkey_s_c_to_xattr(k); + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); ret = le16_to_cpu(xattr.v->x_val_len); if (buffer) { if (ret > size) @@ -160,10 +158,8 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info else memcpy(buffer, xattr_val(xattr.v), ret); } -err2: bch2_trans_iter_exit(trans, &iter); -err1: - return ret < 0 && bch2_err_matches(ret, ENOENT) ? -ENODATA : ret; + return ret; } int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, @@ -177,7 +173,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, int ret; ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?: - bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent); if (ret) return ret; @@ -212,8 +208,8 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, inum, &xattr->k_i, - (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| - (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); + (flags & XATTR_CREATE ? STR_HASH_must_create : 0)| + (flags & XATTR_REPLACE ? 
STR_HASH_must_replace : 0)); } else { struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); @@ -359,6 +355,9 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler, int ret = bch2_trans_do(c, NULL, NULL, 0, bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags)); + if (ret < 0 && bch2_err_matches(ret, ENOENT)) + ret = -ENODATA; + return bch2_err_class(ret); } diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 1337f31a5c49..1574b9eb4c85 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -7,7 +7,7 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b5a25ee49eea..a43897b03ce9 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1934,7 +1934,7 @@ static void free_note_info(struct elf_note_info *info) threads = t->next; WARN_ON(t->notes[0].data && t->notes[0].data != &t->prstatus); for (i = 1; i < info->thread_notes; ++i) - kfree(t->notes[i].data); + kvfree(t->notes[i].data); kfree(t); } kfree(info->psinfo.data); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 7696beec4c21..7130040d92ab 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -316,7 +316,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->dev_stats_valid = 1; - set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); + set_blocksize(bdev_file, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_devices; ret = btrfs_get_dev_zone_info(device, false); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 
a91a8056758a..1b20b3e390df 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3656,7 +3656,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, struct btrfs_super_block *super; struct page *page; u64 bytenr, bytenr_orig; - struct address_space *mapping = bdev->bd_inode->i_mapping; + struct address_space *mapping = bdev->bd_mapping; int ret; bytenr_orig = btrfs_sb_offset(copy_num); @@ -3743,7 +3743,7 @@ static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int max_mirrors) { struct btrfs_fs_info *fs_info = device->fs_info; - struct address_space *mapping = device->bdev->bd_inode->i_mapping; + struct address_space *mapping = device->bdev->bd_mapping; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); int i; int ret; @@ -3861,7 +3861,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) device->commit_total_bytes) break; - folio = filemap_get_folio(device->bdev->bd_inode->i_mapping, + folio = filemap_get_folio(device->bdev->bd_mapping, bytenr >> PAGE_SHIFT); /* If the folio has been removed, then we know it completed. 
*/ if (IS_ERR(folio)) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b6a701011fb0..c39145e8c4ad 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -482,10 +482,12 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, if (flush) sync_blockdev(bdev); - ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE); - if (ret) { - fput(*bdev_file); - goto error; + if (holder) { + ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE); + if (ret) { + fput(*bdev_file); + goto error; + } } invalidate_bdev(bdev); *disk_super = btrfs_read_dev_super(bdev); @@ -498,6 +500,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, return 0; error: + *disk_super = NULL; *bdev_file = NULL; return ret; } @@ -1287,7 +1290,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev return ERR_PTR(-EINVAL); /* pull in the page with our super */ - page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); + page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL); if (IS_ERR(page)) return ERR_CAST(page); @@ -2714,7 +2717,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->dev_stats_valid = 1; - set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); + set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { btrfs_clear_sb_rdonly(sb); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 4cba80b34387..9b43fa493219 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -118,7 +118,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, return -ENOENT; } else if (full[0] && full[1]) { /* Compare two super blocks */ - struct address_space *mapping = bdev->bd_inode->i_mapping; + struct address_space *mapping = bdev->bd_mapping; struct page *page[BTRFS_NR_SB_LOG_ZONES]; struct 
btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES]; int i; diff --git a/fs/buffer.c b/fs/buffer.c index 4f73d23c2c46..8c19e705b9c3 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -189,8 +189,8 @@ EXPORT_SYMBOL(end_buffer_write_sync); static struct buffer_head * __find_get_block_slow(struct block_device *bdev, sector_t block) { - struct inode *bd_inode = bdev->bd_inode; - struct address_space *bd_mapping = bd_inode->i_mapping; + struct address_space *bd_mapping = bdev->bd_mapping; + const int blkbits = bd_mapping->host->i_blkbits; struct buffer_head *ret = NULL; pgoff_t index; struct buffer_head *bh; @@ -199,7 +199,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) int all_mapped = 1; static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1); - index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE; + index = ((loff_t)block << blkbits) / PAGE_SIZE; folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0); if (IS_ERR(folio)) goto out; @@ -233,7 +233,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) (unsigned long long)block, (unsigned long long)bh->b_blocknr, bh->b_state, bh->b_size, bdev, - 1 << bd_inode->i_blkbits); + 1 << blkbits); } out_unlock: spin_unlock(&bd_mapping->i_private_lock); @@ -687,30 +687,37 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) } EXPORT_SYMBOL(mark_buffer_dirty_inode); -/* - * Add a page to the dirty page list. - * - * It is a sad fact of life that this function is called from several places - * deeply under spinlocking. It may not sleep. - * - * If the page has buffers, the uptodate buffers are set dirty, to preserve - * dirty-state coherency between the page and the buffers. It the page does - * not have buffers then when they are later attached they will all be set - * dirty. - * - * The buffers are dirtied before the page is dirtied. There's a small race - * window in which a writepage caller may see the page cleanness but not the - * buffer dirtiness. That's fine. 
If this code were to set the page dirty - * before the buffers, a concurrent writepage caller could clear the page dirty - * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean - * page on the dirty page list. - * - * We use i_private_lock to lock against try_to_free_buffers while using the - * page's buffer list. Also use this to protect against clean buffers being - * added to the page after it was set dirty. - * - * FIXME: may need to call ->reservepage here as well. That's rather up to the - * address_space though. +/** + * block_dirty_folio - Mark a folio as dirty. + * @mapping: The address space containing this folio. + * @folio: The folio to mark dirty. + * + * Filesystems which use buffer_heads can use this function as their + * ->dirty_folio implementation. Some filesystems need to do a little + * work before calling this function. Filesystems which do not use + * buffer_heads should call filemap_dirty_folio() instead. + * + * If the folio has buffers, the uptodate buffers are set dirty, to + * preserve dirty-state coherency between the folio and the buffers. + * Buffers added to a dirty folio are created dirty. + * + * The buffers are dirtied before the folio is dirtied. There's a small + * race window in which writeback may see the folio cleanness but not the + * buffer dirtiness. That's fine. If this code were to set the folio + * dirty before the buffers, writeback could clear the folio dirty flag, + * see a bunch of clean buffers and we'd end up with dirty buffers/clean + * folio on the dirty folio list. + * + * We use i_private_lock to lock against try_to_free_buffers() while + * using the folio's buffer list. This also prevents clean buffers + * being added to the folio after it was set dirty. + * + * Context: May only be called from process context. Does not sleep. 
+ * Caller must ensure that @folio cannot be truncated during this call, + * typically by holding the folio lock or having a page in the folio + * mapped and holding the page table lock. + * + * Return: True if the folio was dirtied; false if it was already dirtied. */ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) { @@ -1034,12 +1041,12 @@ static sector_t folio_init_buffers(struct folio *folio, static bool grow_dev_folio(struct block_device *bdev, sector_t block, pgoff_t index, unsigned size, gfp_t gfp) { - struct inode *inode = bdev->bd_inode; + struct address_space *mapping = bdev->bd_mapping; struct folio *folio; struct buffer_head *bh; sector_t end_block = 0; - folio = __filemap_get_folio(inode->i_mapping, index, + folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return false; @@ -1073,10 +1080,10 @@ static bool grow_dev_folio(struct block_device *bdev, sector_t block, * lock to be atomic wrt __find_get_block(), which does not * run under the folio lock. */ - spin_lock(&inode->i_mapping->i_private_lock); + spin_lock(&mapping->i_private_lock); link_dev_buffers(folio, bh); end_block = folio_init_buffers(folio, bdev, size); - spin_unlock(&inode->i_mapping->i_private_lock); + spin_unlock(&mapping->i_private_lock); unlock: folio_unlock(folio); folio_put(folio); @@ -1219,26 +1226,28 @@ void mark_buffer_write_io_error(struct buffer_head *bh) } EXPORT_SYMBOL(mark_buffer_write_io_error); -/* - * Decrement a buffer_head's reference count. If all buffers against a page - * have zero reference count, are clean and unlocked, and if the page is clean - * and unlocked then try_to_free_buffers() may strip the buffers from the page - * in preparation for freeing it (sometimes, rarely, buffers are removed from - * a page but it ends up not being freed, and buffers may later be reattached). +/** + * __brelse - Release a buffer. + * @bh: The buffer to release. 
+ * + * This variant of brelse() can be called if @bh is guaranteed to not be NULL. */ -void __brelse(struct buffer_head * buf) +void __brelse(struct buffer_head *bh) { - if (atomic_read(&buf->b_count)) { - put_bh(buf); + if (atomic_read(&bh->b_count)) { + put_bh(bh); return; } WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); } EXPORT_SYMBOL(__brelse); -/* - * bforget() is like brelse(), except it discards any - * potentially dirty data. +/** + * __bforget - Discard any dirty data in a buffer. + * @bh: The buffer to forget. + * + * This variant of bforget() can be called if @bh is guaranteed to not + * be NULL. */ void __bforget(struct buffer_head *bh) { @@ -1415,6 +1424,11 @@ EXPORT_SYMBOL(__find_get_block); * @size: The size of buffer_heads for this @bdev. * @gfp: The memory allocation flags to use. * + * The returned buffer head has its reference count incremented, but is + * not locked. The caller should call brelse() when it has finished + * with the buffer. The buffer may not be uptodate. If needed, the + * caller can bring it uptodate either by reading it or overwriting it. + * * Return: The buffer head, or NULL if memory could not be allocated. */ struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, @@ -1446,24 +1460,33 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) EXPORT_SYMBOL(__breadahead); /** - * __bread_gfp() - reads a specified block and returns the bh - * @bdev: the block_device to read from - * @block: number of block - * @size: size (in bytes) to read - * @gfp: page allocation flag - * - * Reads a specified block, and returns buffer head that contains it. - * The page cache can be allocated from non-movable area - * not to prevent page migration if you set gfp to zero. - * It returns NULL if the block was unreadable. + * __bread_gfp() - Read a block. + * @bdev: The block device to read from. + * @block: Block number in units of block size. 
+ * @size: The block size of this device in bytes. + * @gfp: Not page allocation flags; see below. + * + * You are not expected to call this function. You should use one of + * sb_bread(), sb_bread_unmovable() or __bread(). + * + * Read a specified block, and return the buffer head that refers to it. + * If @gfp is 0, the memory will be allocated using the block device's + * default GFP flags. If @gfp is __GFP_MOVABLE, the memory may be + * allocated from a movable area. Do not pass in a complete set of + * GFP flags. + * + * The returned buffer head has its refcount increased. The caller should + * call brelse() when it has finished with the buffer. + * + * Context: May sleep waiting for I/O. + * Return: NULL if the block was unreadable. */ -struct buffer_head * -__bread_gfp(struct block_device *bdev, sector_t block, - unsigned size, gfp_t gfp) +struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { struct buffer_head *bh; - gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS); + gfp |= mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS); /* * Prefer looping in the allocator rather than here, at least that @@ -1696,16 +1719,16 @@ EXPORT_SYMBOL(create_empty_buffers); */ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) { - struct inode *bd_inode = bdev->bd_inode; - struct address_space *bd_mapping = bd_inode->i_mapping; + struct address_space *bd_mapping = bdev->bd_mapping; + const int blkbits = bd_mapping->host->i_blkbits; struct folio_batch fbatch; - pgoff_t index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE; + pgoff_t index = ((loff_t)block << blkbits) / PAGE_SIZE; pgoff_t end; int i, count; struct buffer_head *bh; struct buffer_head *head; - end = ((loff_t)(block + len - 1) << bd_inode->i_blkbits) / PAGE_SIZE; + end = ((loff_t)(block + len - 1) << blkbits) / PAGE_SIZE; folio_batch_init(&fbatch); while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) { 
count = folio_batch_count(&fbatch); @@ -2861,26 +2884,6 @@ int sync_dirty_buffer(struct buffer_head *bh) } EXPORT_SYMBOL(sync_dirty_buffer); -/* - * try_to_free_buffers() checks if all the buffers on this particular folio - * are unused, and releases them if so. - * - * Exclusion against try_to_free_buffers may be obtained by either - * locking the folio or by holding its mapping's i_private_lock. - * - * If the folio is dirty but all the buffers are clean then we need to - * be sure to mark the folio clean as well. This is because the folio - * may be against a block device, and a later reattachment of buffers - * to a dirty folio will set *all* buffers dirty. Which would corrupt - * filesystem data on the same device. - * - * The same applies to regular filesystem folios: if all the buffers are - * clean then we set the folio clean and proceed. To do that, we require - * total exclusion from block_dirty_folio(). That is obtained with - * i_private_lock. - * - * try_to_free_buffers() is non-blocking. - */ static inline int buffer_busy(struct buffer_head *bh) { return atomic_read(&bh->b_count) | @@ -2914,6 +2917,30 @@ failed: return false; } +/** + * try_to_free_buffers - Release buffers attached to this folio. + * @folio: The folio. + * + * If any buffers are in use (dirty, under writeback, elevated refcount), + * no buffers will be freed. + * + * If the folio is dirty but all the buffers are clean then we need to + * be sure to mark the folio clean as well. This is because the folio + * may be against a block device, and a later reattachment of buffers + * to a dirty folio will set *all* buffers dirty. Which would corrupt + * filesystem data on the same device. + * + * The same applies to regular filesystem folios: if all the buffers are + * clean then we set the folio clean and proceed. To do that, we require + * total exclusion from block_dirty_folio(). That is obtained with + * i_private_lock. 
+ * + * Exclusion against try_to_free_buffers may be obtained by either + * locking the folio or by holding its mapping's i_private_lock. + * + * Context: Process context. @folio must be locked. Will not sleep. + * Return: true if all buffers attached to this folio were freed. + */ bool try_to_free_buffers(struct folio *folio) { struct address_space * const mapping = folio->mapping; diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 7ade836beb58..f53977169db4 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -563,8 +563,7 @@ static bool cachefiles_open_file(struct cachefiles_object *object, */ path.mnt = cache->mnt; path.dentry = dentry; - file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT, - d_backing_inode(dentry), cache->cache_cred); + file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT, cache->cache_cred); if (IS_ERR(file)) { trace_cachefiles_vfs_error(object, d_backing_inode(dentry), PTR_ERR(file), diff --git a/fs/coredump.c b/fs/coredump.c index 317065e3eb9b..a57a06b80f57 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -376,9 +376,7 @@ static int zap_process(struct task_struct *start, int exit_code) if (t != current && !(t->flags & PF_POSTCOREDUMP)) { sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); - /* The vhost_worker does not particpate in coredumps */ - if ((t->flags & (PF_USER_WORKER | PF_IO_WORKER)) != PF_USER_WORKER) - nr++; + nr++; } } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 9901057a15ba..460690ca0174 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -183,7 +183,7 @@ static int next_buffer; static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, unsigned int len) { - struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + struct address_space *mapping = sb->s_bdev->bd_mapping; struct file_ra_state ra = {}; struct page *pages[BLKS_PER_BUF]; unsigned i, blocknr, buffer; diff --git a/fs/crypto/inline_crypt.c 
b/fs/crypto/inline_crypt.c index b4002aea7cdb..40de69860dcf 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -284,7 +284,7 @@ static bool bh_get_inode_and_lblk_num(const struct buffer_head *bh, const struct inode **inode_ret, u64 *lblk_num_ret) { - struct page *page = bh->b_page; + struct folio *folio = bh->b_folio; const struct address_space *mapping; const struct inode *inode; @@ -292,13 +292,13 @@ static bool bh_get_inode_and_lblk_num(const struct buffer_head *bh, * The ext4 journal (jbd2) can submit a buffer_head it directly created * for a non-pagecache page. fscrypt doesn't care about these. */ - mapping = page_mapping(page); + mapping = folio_mapping(folio); if (!mapping) return false; inode = mapping->host; *inode_ret = inode; - *lblk_num_ret = ((u64)page->index << (PAGE_SHIFT - inode->i_blkbits)) + + *lblk_num_ret = ((u64)folio->index << (PAGE_SHIFT - inode->i_blkbits)) + (bh_offset(bh) >> inode->i_blkbits); return true; } @@ -1207,17 +1207,17 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, struct vm_area_struct *vma = vmf->vma; struct inode *inode = mapping->host; pgtable_t pgtable = NULL; - struct page *zero_page; + struct folio *zero_folio; spinlock_t *ptl; pmd_t pmd_entry; pfn_t pfn; - zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); + zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm); - if (unlikely(!zero_page)) + if (unlikely(!zero_folio)) goto fallback; - pfn = page_to_pfn_t(zero_page); + pfn = page_to_pfn_t(&zero_folio->page); *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_PMD | DAX_ZERO_PAGE); @@ -1237,17 +1237,17 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); mm_inc_nr_ptes(vma->vm_mm); } - pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); + pmd_entry = mk_pmd(&zero_folio->page, vmf->vma->vm_page_prot); pmd_entry = pmd_mkhuge(pmd_entry); set_pmd_at(vmf->vma->vm_mm, 
pmd_addr, vmf->pmd, pmd_entry); spin_unlock(ptl); - trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); + trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry); return VM_FAULT_NOPAGE; fallback: if (pgtable) pte_free(vma->vm_mm, pgtable); - trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry); return VM_FAULT_FALLBACK; } #else diff --git a/fs/dcache.c b/fs/dcache.c index 407095188f83..1ee6404b430b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2360,19 +2360,17 @@ EXPORT_SYMBOL(d_hash_and_lookup); * - unhash this dentry and free it. * * Usually, we want to just turn this into - * a negative dentry, but if anybody else is - * currently using the dentry or the inode - * we can't do that and we fall back on removing - * it from the hash queues and waiting for - * it to be deleted later when it has no users + * a negative dentry, but certain workloads can + * generate a large number of negative dentries. + * Therefore, it would be better to simply + * unhash it. */ /** * d_delete - delete a dentry * @dentry: The dentry to delete * - * Turn the dentry into a negative dentry if possible, otherwise - * remove it from the hash queues so it can be deleted later + * Remove the dentry from the hash queues so it can be deleted later. */ void d_delete(struct dentry * dentry) @@ -2381,6 +2379,8 @@ void d_delete(struct dentry * dentry) spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); + __d_drop(dentry); + /* * Are we the only user? 
*/ @@ -2388,7 +2388,6 @@ void d_delete(struct dentry * dentry) dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); } else { - __d_drop(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); } diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 52524bd9698b..5fc03c1e2757 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -29,11 +29,9 @@ void erofs_put_metabuf(struct erofs_buf *buf) * Derive the block size from inode->i_blkbits to make compatible with * anonymous inode in fscache mode. */ -void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, +void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, enum erofs_kmap_type type) { - struct inode *inode = buf->inode; - erofs_off_t offset = (erofs_off_t)blkaddr << inode->i_blkbits; pgoff_t index = offset >> PAGE_SHIFT; struct page *page = buf->page; struct folio *folio; @@ -43,7 +41,7 @@ void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, erofs_put_metabuf(buf); nofs_flag = memalloc_nofs_save(); - folio = read_cache_folio(inode->i_mapping, index, NULL, NULL); + folio = read_cache_folio(buf->mapping, index, NULL, NULL); memalloc_nofs_restore(nofs_flag); if (IS_ERR(folio)) return folio; @@ -68,16 +66,16 @@ void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) { if (erofs_is_fscache_mode(sb)) - buf->inode = EROFS_SB(sb)->s_fscache->inode; + buf->mapping = EROFS_SB(sb)->s_fscache->inode->i_mapping; else - buf->inode = sb->s_bdev->bd_inode; + buf->mapping = sb->s_bdev->bd_mapping; } void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, erofs_blk_t blkaddr, enum erofs_kmap_type type) { erofs_init_metabuf(buf, sb); - return erofs_bread(buf, blkaddr, type); + return erofs_bread(buf, erofs_pos(sb, blkaddr), type); } static int erofs_map_blocks_flatmode(struct inode *inode, diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index b80abec0531a..2193a6710c8f 100644 --- a/fs/erofs/dir.c +++ 
b/fs/erofs/dir.c @@ -58,12 +58,12 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) int err = 0; bool initial = true; - buf.inode = dir; + buf.mapping = dir->i_mapping; while (ctx->pos < dirsize) { struct erofs_dirent *de; unsigned int nameoff, maxsize; - de = erofs_bread(&buf, i, EROFS_KMAP); + de = erofs_bread(&buf, erofs_pos(sb, i), EROFS_KMAP); if (IS_ERR(de)) { erofs_err(sb, "fail to readdir of logical block %u of nid %llu", i, EROFS_I(dir)->nid); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 21def866a482..00bbb288c08c 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -216,7 +216,7 @@ enum erofs_kmap_type { }; struct erofs_buf { - struct inode *inode; + struct address_space *mapping; struct page *page; void *base; enum erofs_kmap_type kmap_type; @@ -402,7 +402,7 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp); void erofs_unmap_metabuf(struct erofs_buf *buf); void erofs_put_metabuf(struct erofs_buf *buf); -void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, +void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, enum erofs_kmap_type type); void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb); void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index f0110a78acb2..c94d0c1608a8 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -99,8 +99,8 @@ static void *erofs_find_target_block(struct erofs_buf *target, struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_dirent *de; - buf.inode = dir; - de = erofs_bread(&buf, mid, EROFS_KMAP); + buf.mapping = dir->i_mapping; + de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), EROFS_KMAP); if (!IS_ERR(de)) { const int nameoff = nameoff_from_disk(de->nameoff, bsz); const int ndirents = nameoff / sizeof(*de); @@ -171,7 +171,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, 
qn.name = name->name; qn.end = name->name + name->len; - buf.inode = dir; + buf.mapping = dir->i_mapping; ndirents = 0; de = erofs_find_target_block(&buf, dir, &qn, &ndirents); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 044c79229a78..348ef1660e50 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -132,11 +132,11 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, int len, i, cnt; *offset = round_up(*offset, 4); - ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP); + ptr = erofs_bread(buf, *offset, EROFS_KMAP); if (IS_ERR(ptr)) return ptr; - len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(sb, *offset)]); + len = le16_to_cpu(*(__le16 *)ptr); if (!len) len = U16_MAX + 1; buffer = kmalloc(len, GFP_KERNEL); @@ -148,12 +148,12 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, for (i = 0; i < len; i += cnt) { cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset), len - i); - ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP); + ptr = erofs_bread(buf, *offset, EROFS_KMAP); if (IS_ERR(ptr)) { kfree(buffer); return ptr; } - memcpy(buffer + i, ptr + erofs_blkoff(sb, *offset), cnt); + memcpy(buffer + i, ptr, cnt); *offset += cnt; } return buffer; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index b58316b49a43..a90d7d649739 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -81,13 +81,13 @@ static int erofs_init_inode_xattrs(struct inode *inode) it.pos = erofs_iloc(inode) + vi->inode_isize; /* read in shared xattr array (non-atomic, see kmalloc below) */ - it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), EROFS_KMAP); + it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP); if (IS_ERR(it.kaddr)) { ret = PTR_ERR(it.kaddr); goto out_unlock; } - ih = it.kaddr + erofs_blkoff(sb, it.pos); + ih = it.kaddr; vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter); vi->xattr_shared_count = ih->h_shared_count; vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, @@ -102,16 
+102,14 @@ static int erofs_init_inode_xattrs(struct inode *inode) it.pos += sizeof(struct erofs_xattr_ibody_header); for (i = 0; i < vi->xattr_shared_count; ++i) { - it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), - EROFS_KMAP); + it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP); if (IS_ERR(it.kaddr)) { kfree(vi->xattr_shared_xattrs); vi->xattr_shared_xattrs = NULL; ret = PTR_ERR(it.kaddr); goto out_unlock; } - vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *) - (it.kaddr + erofs_blkoff(sb, it.pos))); + vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)it.kaddr); it.pos += sizeof(__le32); } erofs_put_metabuf(&it.buf); @@ -185,12 +183,11 @@ static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it, void *src; for (processed = 0; processed < len; processed += slice) { - it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), - EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); - src = it->kaddr + erofs_blkoff(sb, it->pos); + src = it->kaddr; slice = min_t(unsigned int, sb->s_blocksize - erofs_blkoff(sb, it->pos), len - processed); memcpy(it->buffer + it->buffer_ofs, src, slice); @@ -208,8 +205,7 @@ static int erofs_listxattr_foreach(struct erofs_xattr_iter *it) int err; /* 1. handle xattr entry */ - entry = *(struct erofs_xattr_entry *) - (it->kaddr + erofs_blkoff(it->sb, it->pos)); + entry = *(struct erofs_xattr_entry *)it->kaddr; it->pos += sizeof(struct erofs_xattr_entry); base_index = entry.e_name_index; @@ -259,8 +255,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) unsigned int slice, processed, value_sz; /* 1. handle xattr entry */ - entry = *(struct erofs_xattr_entry *) - (it->kaddr + erofs_blkoff(sb, it->pos)); + entry = *(struct erofs_xattr_entry *)it->kaddr; it->pos += sizeof(struct erofs_xattr_entry); value_sz = le16_to_cpu(entry.e_value_size); @@ -291,8 +286,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) /* 2. 
handle xattr name */ for (processed = 0; processed < entry.e_name_len; processed += slice) { - it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), - EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); @@ -300,7 +294,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) sb->s_blocksize - erofs_blkoff(sb, it->pos), entry.e_name_len - processed); if (memcmp(it->name.name + it->infix_len + processed, - it->kaddr + erofs_blkoff(sb, it->pos), slice)) + it->kaddr, slice)) return -ENOATTR; it->pos += slice; } @@ -336,13 +330,11 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it, it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz; while (remaining) { - it->kaddr = erofs_bread(&it->buf, erofs_blknr(it->sb, it->pos), - EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); - entry_sz = erofs_xattr_entry_size(it->kaddr + - erofs_blkoff(it->sb, it->pos)); + entry_sz = erofs_xattr_entry_size(it->kaddr); /* xattr on-disk corruption: xattr entry beyond xattr_isize */ if (remaining < entry_sz) { DBG_BUGON(1); @@ -375,8 +367,7 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it, for (i = 0; i < vi->xattr_shared_count; ++i) { it->pos = erofs_pos(sb, sbi->xattr_blkaddr) + vi->xattr_shared_xattrs[i] * sizeof(__le32); - it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), - EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); @@ -492,7 +483,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb) return -ENOMEM; if (sbi->packed_inode) - buf.inode = sbi->packed_inode; + buf.mapping = sbi->packed_inode->i_mapping; else erofs_init_metabuf(&buf, sb); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 3216b920d369..283c9c3a611d 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -936,16 +936,16 @@ static int 
z_erofs_read_fragment(struct super_block *sb, struct page *page, if (!packed_inode) return -EFSCORRUPTED; - buf.inode = packed_inode; + buf.mapping = packed_inode->i_mapping; for (; cur < end; cur += cnt, pos += cnt) { cnt = min_t(unsigned int, end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); - src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP); + src = erofs_bread(&buf, pos, EROFS_KMAP); if (IS_ERR(src)) { erofs_put_metabuf(&buf); return PTR_ERR(src); } - memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt); + memcpy_to_page(page, cur, src, cnt); } erofs_put_metabuf(&buf); return 0; diff --git a/fs/exec.c b/fs/exec.c index b3c40fbb325f..40073142288f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -67,6 +67,7 @@ #include <linux/time_namespace.h> #include <linux/user_events.h> #include <linux/rseq.h> +#include <linux/ksm.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -268,6 +269,14 @@ static int __bprm_mm_init(struct linux_binprm *bprm) } /* + * Need to be called with mmap write lock + * held, to avoid race with ksmd. + */ + err = ksm_execve(mm); + if (err) + goto err_ksm; + + /* * Place the stack at the largest stack address the architecture * supports. Later, we'll move this to an appropriate place. We don't * use STACK_TOP because that can depend on attributes which aren't @@ -288,6 +297,8 @@ static int __bprm_mm_init(struct linux_binprm *bprm) bprm->p = vma->vm_end - sizeof(void *); return 0; err: + ksm_exit(mm); +err_ksm: mmap_write_unlock(mm); err_free: bprm->vma = NULL; diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index d6cfb1849580..d5bce83ad905 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -3,7 +3,6 @@ config EXT2_FS tristate "Second extended fs support (DEPRECATED)" select BUFFER_HEAD select FS_IOMAP - select LEGACY_DIRECT_IO help Ext2 is a standard Linux file system for hard disks. 
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 4fb155b5a958..087457061c6e 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -175,7 +175,6 @@ Eend: (unsigned long) le32_to_cpu(p->inode)); } fail: - folio_set_error(folio); return false; } diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 4ddc36f4dbd4..10b061ac5bc0 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -302,6 +302,12 @@ static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return generic_file_write_iter(iocb, from); } +static int ext2_file_open(struct inode *inode, struct file *filp) +{ + filp->f_mode |= FMODE_CAN_ODIRECT; + return dquot_file_open(inode, filp); +} + const struct file_operations ext2_file_operations = { .llseek = generic_file_llseek, .read_iter = ext2_file_read_iter, @@ -311,7 +317,7 @@ const struct file_operations ext2_file_operations = { .compat_ioctl = ext2_compat_ioctl, #endif .mmap = ext2_file_mmap, - .open = dquot_file_open, + .open = ext2_file_open, .release = ext2_release_file, .fsync = ext2_fsync, .get_unmapped_area = thp_get_unmapped_area, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index f3d570a9302b..0caa1650cee8 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -965,7 +965,6 @@ const struct address_space_operations ext2_aops = { .write_begin = ext2_write_begin, .write_end = ext2_write_end, .bmap = ext2_bmap, - .direct_IO = noop_direct_IO, .writepages = ext2_writepages, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, @@ -974,7 +973,6 @@ const struct address_space_operations ext2_aops = { static const struct address_space_operations ext2_dax_aops = { .writepages = ext2_dax_writepages, - .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, }; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index ef4c19e5f570..0c5a79c3b5d4 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -68,11 +68,6 @@ extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); static inline int ext4_init_acl(handle_t 
*handle, struct inode *inode, struct inode *dir) { - /* usually, the umask is applied by posix_acl_create(), but if - ext4 ACL support is disabled at compile time, we need to do - it here, because posix_acl_create() will never be called */ - inode->i_mode &= ~current_umask(); - return 0; } #endif /* CONFIG_EXT4_FS_POSIX_ACL */ diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 3985f8c33f95..ff4514e4626b 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -192,7 +192,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) (PAGE_SHIFT - inode->i_blkbits); if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( - sb->s_bdev->bd_inode->i_mapping, + sb->s_bdev->bd_mapping, &file->f_ra, file, index, 1); file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8d126654019e..983dad8c07ec 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -213,11 +213,14 @@ enum criteria { #define EXT4_MB_USE_RESERVED 0x2000 /* Do strict check for free blocks while retrying block allocation */ #define EXT4_MB_STRICT_CHECK 0x4000 -/* Large fragment size list lookup succeeded at least once for cr = 0 */ +/* Large fragment size list lookup succeeded at least once for + * CR_POWER2_ALIGNED */ #define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000 -/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ +/* Avg fragment size rb tree lookup succeeded at least once for + * CR_GOAL_LEN_FAST */ #define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000 -/* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */ +/* Avg fragment size rb tree lookup succeeded at least once for + * CR_BEST_AVAIL_LEN */ #define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000 struct ext4_allocation_request { diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 5d8055161acd..da4a82456383 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -206,7 +206,7 @@ static void ext4_journal_abort_handle(const char *caller, 
unsigned int line, static void ext4_check_bdev_write_error(struct super_block *sb) { - struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + struct address_space *mapping = sb->s_bdev->bd_mapping; struct ext4_sb_info *sbi = EXT4_SB(sb); int err; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e57054bdc5fd..e067f2dd0335 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3402,9 +3402,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; - int allocated = 0, max_zeroout = 0; int err = 0; int split_flag = EXT4_EXT_DATA_VALID2; + int allocated = 0; + unsigned int max_zeroout = 0; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map_len); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 28c51b0cc4db..c89e434db6b7 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -844,8 +844,7 @@ static int ext4_sample_last_mounted(struct super_block *sb, if (err) goto out_journal; lock_buffer(sbi->s_sbh); - strncpy(sbi->s_es->s_last_mounted, cp, - sizeof(sbi->s_es->s_last_mounted)); + strtomem_pad(sbi->s_es->s_last_mounted, cp, 0); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); @@ -885,7 +884,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) return ret; } - filp->f_mode |= FMODE_NOWAIT; + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; return dquot_file_open(inode, filp); } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 537803250ca9..4bae9ccf5fe0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1865,7 +1865,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) len = folio_size(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) - len = size & ~PAGE_MASK; + len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, 
len); if (!err) mpd->wbc->nr_to_write--; @@ -2334,7 +2334,7 @@ static int mpage_journal_page_buffers(handle_t *handle, if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) - len = size - folio_pos(folio); + len = size & (len - 1); return ext4_journal_folio_buffers(handle, folio, len); } @@ -2887,9 +2887,6 @@ retry: if (IS_ERR(folio)) return PTR_ERR(folio); - /* In case writeback began while the folio was unlocked */ - folio_wait_stable(folio); - #ifdef CONFIG_FS_ENCRYPTION ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); #else @@ -3530,7 +3527,6 @@ static const struct address_space_operations ext4_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3547,7 +3543,6 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3564,7 +3559,6 @@ static const struct address_space_operations ext4_da_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3573,7 +3567,6 @@ static const struct address_space_operations ext4_da_aops = { static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, - .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, .swap_activate = ext4_iomap_swap_activate, diff --git 
a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7160a71044c8..dab7acd49709 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1150,9 +1150,8 @@ static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label */ BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX); - memset(label, 0, sizeof(label)); lock_buffer(sbi->s_sbh); - strncpy(label, sbi->s_es->s_volume_name, EXT4_LABEL_MAX); + strscpy_pad(label, sbi->s_es->s_volume_name); unlock_buffer(sbi->s_sbh); if (copy_to_user(user_label, label, sizeof(label))) diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index 044ca5238f41..bb2a223b207c 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -30,7 +30,31 @@ struct mbt_ext4_super_block { #define MBT_CTX(_sb) (&MBT_SB(_sb)->mbt_ctx) #define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group]) +static struct inode *mbt_alloc_inode(struct super_block *sb) +{ + struct ext4_inode_info *ei; + + ei = kmalloc(sizeof(struct ext4_inode_info), GFP_KERNEL); + if (!ei) + return NULL; + + INIT_LIST_HEAD(&ei->i_orphan); + init_rwsem(&ei->xattr_sem); + init_rwsem(&ei->i_data_sem); + inode_init_once(&ei->vfs_inode); + ext4_fc_init_inode(&ei->vfs_inode); + + return &ei->vfs_inode; +} + +static void mbt_free_inode(struct inode *inode) +{ + kfree(EXT4_I(inode)); +} + static const struct super_operations mbt_sops = { + .alloc_inode = mbt_alloc_inode, + .free_inode = mbt_free_inode, }; static void mbt_kill_sb(struct super_block *sb) @@ -859,6 +883,56 @@ static void test_mb_free_blocks(struct kunit *test) ext4_mb_unload_buddy(&e4b); } +#define COUNT_FOR_ESTIMATE 100000 +static void test_mb_mark_used_cost(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_free_extent ex; + int ret; + struct test_range ranges[TEST_RANGE_COUNT]; + int i, j; + unsigned long start, end, all = 0; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + 
kunit_skip(test, "blocksize exceeds pagesize"); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + ex.fe_group = TEST_GOAL_GROUP; + for (j = 0; j < COUNT_FOR_ESTIMATE; j++) { + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + start = jiffies; + for (i = 0; i < TEST_RANGE_COUNT; i++) { + if (ranges[i].len == 0) + continue; + + ex.fe_start = ranges[i].start; + ex.fe_len = ranges[i].len; + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(&e4b, &ex); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + } + end = jiffies; + all += (end - start); + + for (i = 0; i < TEST_RANGE_COUNT; i++) { + if (ranges[i].len == 0) + continue; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_free_blocks(NULL, &e4b, ranges[i].start, + ranges[i].len); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + } + } + + kunit_info(test, "costed jiffies %lu\n", all); + ext4_mb_unload_buddy(&e4b); +} + static const struct mbt_ext4_block_layout mbt_test_layouts[] = { { .blocksize_bits = 10, @@ -901,6 +975,8 @@ static struct kunit_case mbt_test_cases[] = { KUNIT_CASE_PARAM(test_mb_mark_used, mbt_layouts_gen_params), KUNIT_CASE_PARAM(test_mb_free_blocks, mbt_layouts_gen_params), KUNIT_CASE_PARAM(test_mark_diskspace_used, mbt_layouts_gen_params), + KUNIT_CASE_PARAM_ATTR(test_mb_mark_used_cost, mbt_layouts_gen_params, + { .speed = KUNIT_SPEED_SLOW }), {} }; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 12b3f196010b..9dda9cd68ab2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -831,6 +831,8 @@ static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) return 0; if (order == MB_NUM_ORDERS(sb)) order--; + if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb))) + order = MB_NUM_ORDERS(sb) - 1; return order; } @@ -1008,6 +1010,8 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context * goal length. 
*/ order = fls(ac->ac_g_ex.fe_len) - 1; + if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb))) + order = MB_NUM_ORDERS(ac->ac_sb); min_order = order - sbi->s_mb_best_avail_max_trim_order; if (min_order < 0) min_order = 0; @@ -1076,23 +1080,11 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) } /* - * Return next linear group for allocation. If linear traversal should not be - * performed, this function just returns the same group + * Return next linear group for allocation. */ static ext4_group_t -next_linear_group(struct ext4_allocation_context *ac, ext4_group_t group, - ext4_group_t ngroups) +next_linear_group(ext4_group_t group, ext4_group_t ngroups) { - if (!should_optimize_scan(ac)) - goto inc_and_return; - - if (ac->ac_groups_linear_remaining) { - ac->ac_groups_linear_remaining--; - goto inc_and_return; - } - - return group; -inc_and_return: /* * Artificially restricted ngroups for non-extent * files makes group > ngroups possible on first loop. @@ -1118,8 +1110,19 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, { *new_cr = ac->ac_criteria; - if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { - *group = next_linear_group(ac, *group, ngroups); + if (!should_optimize_scan(ac)) { + *group = next_linear_group(*group, ngroups); + return; + } + + /* + * Optimized scanning can return non adjacent groups which can cause + * seek overhead for rotational disks. So try few linear groups before + * trying optimized scan. + */ + if (ac->ac_groups_linear_remaining) { + *group = next_linear_group(*group, ngroups); + ac->ac_groups_linear_remaining--; return; } @@ -1131,8 +1134,9 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, ext4_mb_choose_next_group_best_avail(ac, new_cr, group); } else { /* - * TODO: For CR=2, we can arrange groups in an rb tree sorted by - * bb_free. But until that happens, we should never come here. 
+ * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an + * rb tree sorted by bb_free. But until that happens, we should + * never come here. */ WARN_ON(1); } @@ -1270,7 +1274,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b) * for this page; do not hold this lock when calling this routine! */ -static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) +static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) { ext4_group_t ngroups; unsigned int blocksize; @@ -1288,13 +1292,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) char *bitmap; struct ext4_group_info *grinfo; - inode = page->mapping->host; + inode = folio->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = i_blocksize(inode); blocks_per_page = PAGE_SIZE / blocksize; - mb_debug(sb, "init page %lu\n", page->index); + mb_debug(sb, "init folio %lu\n", folio->index); groups_per_page = blocks_per_page >> 1; if (groups_per_page == 0) @@ -1309,9 +1313,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) } else bh = &bhs; - first_group = page->index * blocks_per_page / 2; + first_group = folio->index * blocks_per_page / 2; - /* read all groups the page covers into the cache */ + /* read all groups the folio covers into the cache */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { if (group >= ngroups) break; @@ -1322,10 +1326,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) /* * If page is uptodate then we came here after online resize * which added some new uninitialized group info structs, so - * we must skip all initialized uptodate buddies on the page, + * we must skip all initialized uptodate buddies on the folio, * which may be currently in use by an allocating task. 
*/ - if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { + if (folio_test_uptodate(folio) && + !EXT4_MB_GRP_NEED_INIT(grinfo)) { bh[i] = NULL; continue; } @@ -1349,7 +1354,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) err = err2; } - first_block = page->index * blocks_per_page; + first_block = folio->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { group = (first_block + i) >> 1; if (group >= ngroups) @@ -1370,7 +1375,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) * above * */ - data = page_address(page) + (i * blocksize); + data = folio_address(folio) + (i * blocksize); bitmap = bh[group - first_group]->b_data; /* @@ -1385,8 +1390,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); - mb_debug(sb, "put buddy for group %u in page %lu/%x\n", - group, page->index, i * blocksize); + mb_debug(sb, "put buddy for group %u in folio %lu/%x\n", + group, folio->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, @@ -1404,8 +1409,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) } else { /* this is block of bitmap */ BUG_ON(incore != NULL); - mb_debug(sb, "put bitmap for group %u in page %lu/%x\n", - group, page->index, i * blocksize); + mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n", + group, folio->index, i * blocksize); trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ @@ -1423,7 +1428,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) incore = data; } } - SetPageUptodate(page); + folio_mark_uptodate(folio); out: if (bh) { @@ -1439,7 +1444,7 @@ out: * Lock the buddy and bitmap pages. This make sure other parallel init_group * on the same buddy page doesn't happen whild holding the buddy page lock. 
* Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap - * are on the same page e4b->bd_buddy_page is NULL and return value is 0. + * are on the same page e4b->bd_buddy_folio is NULL and return value is 0. */ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) @@ -1447,10 +1452,10 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum, poff; int blocks_per_page; - struct page *page; + struct folio *folio; - e4b->bd_buddy_page = NULL; - e4b->bd_bitmap_page = NULL; + e4b->bd_buddy_folio = NULL; + e4b->bd_bitmap_folio = NULL; blocks_per_page = PAGE_SIZE / sb->s_blocksize; /* @@ -1461,12 +1466,13 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (!page) - return -ENOMEM; - BUG_ON(page->mapping != inode->i_mapping); - e4b->bd_bitmap_page = page; - e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); + BUG_ON(folio->mapping != inode->i_mapping); + e4b->bd_bitmap_folio = folio; + e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); if (blocks_per_page >= 2) { /* buddy and bitmap are on the same page */ @@ -1474,23 +1480,24 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, } /* blocks_per_page == 1, hence we need another page for the buddy */ - page = find_or_create_page(inode->i_mapping, block + 1, gfp); - if (!page) - return -ENOMEM; - BUG_ON(page->mapping != inode->i_mapping); - e4b->bd_buddy_page = page; + folio = __filemap_get_folio(inode->i_mapping, block + 1, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); + 
BUG_ON(folio->mapping != inode->i_mapping); + e4b->bd_buddy_folio = folio; return 0; } static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) { - if (e4b->bd_bitmap_page) { - unlock_page(e4b->bd_bitmap_page); - put_page(e4b->bd_bitmap_page); + if (e4b->bd_bitmap_folio) { + folio_unlock(e4b->bd_bitmap_folio); + folio_put(e4b->bd_bitmap_folio); } - if (e4b->bd_buddy_page) { - unlock_page(e4b->bd_buddy_page); - put_page(e4b->bd_buddy_page); + if (e4b->bd_buddy_folio) { + folio_unlock(e4b->bd_buddy_folio); + folio_put(e4b->bd_buddy_folio); } } @@ -1505,7 +1512,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) struct ext4_group_info *this_grp; struct ext4_buddy e4b; - struct page *page; + struct folio *folio; int ret = 0; might_sleep(); @@ -1532,16 +1539,16 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) goto err; } - page = e4b.bd_bitmap_page; - ret = ext4_mb_init_cache(page, NULL, gfp); + folio = e4b.bd_bitmap_folio; + ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) goto err; - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } - if (e4b.bd_buddy_page == NULL) { + if (e4b.bd_buddy_folio == NULL) { /* * If both the bitmap and buddy are in * the same page we don't need to force @@ -1551,11 +1558,11 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) goto err; } /* init buddy cache */ - page = e4b.bd_buddy_page; - ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp); + folio = e4b.bd_buddy_folio; + ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp); if (ret) goto err; - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } @@ -1577,7 +1584,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, int block; int pnum; int poff; - struct page *page; + struct folio *folio; int ret; struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1595,8 +1602,8 @@ 
ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; - e4b->bd_buddy_page = NULL; - e4b->bd_bitmap_page = NULL; + e4b->bd_buddy_folio = NULL; + e4b->bd_bitmap_folio = NULL; if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { /* @@ -1617,102 +1624,103 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, pnum = block / blocks_per_page; poff = block % blocks_per_page; - /* we could use find_or_create_page(), but it locks page - * what we'd like to avoid in fast path ... */ - page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); - if (page == NULL || !PageUptodate(page)) { - if (page) + /* Avoid locking the folio in the fast path ... */ + folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (!IS_ERR(folio)) /* - * drop the page reference and try - * to get the page with lock. If we + * drop the folio reference and try + * to get the folio with lock. If we * are not uptodate that implies - * somebody just created the page but - * is yet to initialize the same. So + * somebody just created the folio but + * is yet to initialize it. So * wait for it to initialize. 
*/ - put_page(page); - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (page) { - if (WARN_RATELIMIT(page->mapping != inode->i_mapping, - "ext4: bitmap's paging->mapping != inode->i_mapping\n")) { + folio_put(folio); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (!IS_ERR(folio)) { + if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, + "ext4: bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ - unlock_page(page); + folio_unlock(folio); ret = -EINVAL; goto err; } - if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, NULL, gfp); + if (!folio_test_uptodate(folio)) { + ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) { - unlock_page(page); + folio_unlock(folio); goto err; } - mb_cmp_bitmaps(e4b, page_address(page) + + mb_cmp_bitmaps(e4b, folio_address(folio) + (poff * sb->s_blocksize)); } - unlock_page(page); + folio_unlock(folio); } } - if (page == NULL) { - ret = -ENOMEM; + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto err; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } - /* Pages marked accessed already */ - e4b->bd_bitmap_page = page; - e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + /* Folios marked accessed already */ + e4b->bd_bitmap_folio = folio; + e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); block++; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); - if (page == NULL || !PageUptodate(page)) { - if (page) - put_page(page); - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (page) { - if (WARN_RATELIMIT(page->mapping != inode->i_mapping, - "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) { + folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (!IS_ERR(folio)) + folio_put(folio); + folio = 
__filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (!IS_ERR(folio)) { + if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, + "ext4: buddy bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ - unlock_page(page); + folio_unlock(folio); ret = -EINVAL; goto err; } - if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, e4b->bd_bitmap, + if (!folio_test_uptodate(folio)) { + ret = ext4_mb_init_cache(folio, e4b->bd_bitmap, gfp); if (ret) { - unlock_page(page); + folio_unlock(folio); goto err; } } - unlock_page(page); + folio_unlock(folio); } } - if (page == NULL) { - ret = -ENOMEM; + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto err; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } - /* Pages marked accessed already */ - e4b->bd_buddy_page = page; - e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); + /* Folios marked accessed already */ + e4b->bd_buddy_folio = folio; + e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize); return 0; err: - if (page) - put_page(page); - if (e4b->bd_bitmap_page) - put_page(e4b->bd_bitmap_page); + if (!IS_ERR_OR_NULL(folio)) + folio_put(folio); + if (e4b->bd_bitmap_folio) + folio_put(e4b->bd_bitmap_folio); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; @@ -1727,10 +1735,10 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { - if (e4b->bd_bitmap_page) - put_page(e4b->bd_bitmap_page); - if (e4b->bd_buddy_page) - put_page(e4b->bd_buddy_page); + if (e4b->bd_bitmap_folio) + folio_put(e4b->bd_bitmap_folio); + if (e4b->bd_buddy_folio) + folio_put(e4b->bd_buddy_folio); } @@ -2040,13 +2048,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) int ord; int mlen = 0; int max = 0; - int cur; int start = ex->fe_start; int len = ex->fe_len; unsigned ret = 0; int len0 = len; void *buddy; - bool split = false; 
+ int ord_start, ord_end; BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); @@ -2071,16 +2078,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) /* let's maintain buddy itself */ while (len) { - if (!split) - ord = mb_find_order_for_block(e4b, start); + ord = mb_find_order_for_block(e4b, start); if (((start >> ord) << ord) == start && len >= (1 << ord)) { /* the whole chunk may be allocated at once! */ mlen = 1 << ord; - if (!split) - buddy = mb_find_buddy(e4b, ord, &max); - else - split = false; + buddy = mb_find_buddy(e4b, ord, &max); BUG_ON((start >> ord) >= max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; @@ -2094,20 +2097,29 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) if (ret == 0) ret = len | (ord << 16); - /* we have to split large buddy */ BUG_ON(ord <= 0); buddy = mb_find_buddy(e4b, ord, &max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; - ord--; - cur = (start >> ord) & ~1U; - buddy = mb_find_buddy(e4b, ord, &max); - mb_clear_bit(cur, buddy); - mb_clear_bit(cur + 1, buddy); - e4b->bd_info->bb_counters[ord]++; - e4b->bd_info->bb_counters[ord]++; - split = true; + ord_start = (start >> ord) << ord; + ord_end = ord_start + (1 << ord); + /* first chunk */ + if (start > ord_start) + ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, + ord_start, start - ord_start, + e4b->bd_info); + + /* last chunk */ + if (start + len < ord_end) { + ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, + start + len, + ord_end - (start + len), + e4b->bd_info); + break; + } + len = start + len - ord_end; + start = ord_end; } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); @@ -2149,10 +2161,10 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, * double allocate blocks. 
The reference is dropped * in ext4_mb_release_context */ - ac->ac_bitmap_page = e4b->bd_bitmap_page; - get_page(ac->ac_bitmap_page); - ac->ac_buddy_page = e4b->bd_buddy_page; - get_page(ac->ac_buddy_page); + ac->ac_bitmap_folio = e4b->bd_bitmap_folio; + folio_get(ac->ac_bitmap_folio); + ac->ac_buddy_folio = e4b->bd_buddy_folio; + folio_get(ac->ac_buddy_folio); /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { spin_lock(&sbi->s_md_lock); @@ -2675,7 +2687,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, int ret; /* - * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic + * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic * search to find large good chunks almost for free. If buddy * data is not ready, then this optimization makes no sense. But * we never skip the first block group in a flex_bg, since this @@ -2856,6 +2868,7 @@ repeat: group = ac->ac_g_ex.fe_group; ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; prefetch_grp = group; + nr = 0; for (i = 0, new_cr = cr; i < ngroups; i++, ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { @@ -3186,7 +3199,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) } static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) -__acquires(&EXT4_SB(sb)->s_mb_rb_lock) { struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; @@ -3440,10 +3452,11 @@ static int ext4_mb_init_backend(struct super_block *sb) } if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) sbi->s_mb_prefetch = ext4_get_groups_count(sb); - /* now many real IOs to prefetch within a single allocation at cr=0 - * given cr=0 is an CPU-related optimization we shouldn't try to - * load too many groups, at some point we should start to use what - * we've got in memory. + /* + * now many real IOs to prefetch within a single allocation at + * CR_POWER2_ALIGNED. 
Given CR_POWER2_ALIGNED is an CPU-related + * optimization we shouldn't try to load too many groups, at some point + * we should start to use what we've got in memory. * with an average random access time 5ms, it'd take a second to get * 200 groups (* N with flex_bg), so let's make this limit 4 */ @@ -3884,8 +3897,8 @@ static void ext4_free_data_in_buddy(struct super_block *sb, /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() */ - put_page(e4b.bd_buddy_page); - put_page(e4b.bd_bitmap_page); + folio_put(e4b.bd_buddy_folio); + folio_put(e4b.bd_bitmap_folio); } ext4_unlock_group(sb, entry->efd_group); ext4_mb_unload_buddy(&e4b); @@ -5989,10 +6002,10 @@ static void ext4_mb_release_context(struct ext4_allocation_context *ac) ext4_mb_put_pa(ac, ac->ac_sb, pa); } - if (ac->ac_bitmap_page) - put_page(ac->ac_bitmap_page); - if (ac->ac_buddy_page) - put_page(ac->ac_buddy_page); + if (ac->ac_bitmap_folio) + folio_put(ac->ac_bitmap_folio); + if (ac->ac_buddy_folio) + folio_put(ac->ac_buddy_folio); if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); @@ -6113,6 +6126,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) ext4_mb_mark_bb(sb, block, 1, true); ar->len = 1; + *errp = 0; return block; } @@ -6307,8 +6321,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct rb_node *parent = NULL, *new_node; BUG_ON(!ext4_handle_valid(handle)); - BUG_ON(e4b->bd_bitmap_page == NULL); - BUG_ON(e4b->bd_buddy_page == NULL); + BUG_ON(e4b->bd_bitmap_folio == NULL); + BUG_ON(e4b->bd_buddy_folio == NULL); new_node = &new_entry->efd_node; cluster = new_entry->efd_start_cluster; @@ -6319,8 +6333,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, * otherwise we'll refresh it from * on-disk bitmap and lose not-yet-available * blocks */ - get_page(e4b->bd_buddy_page); - get_page(e4b->bd_bitmap_page); + folio_get(e4b->bd_buddy_folio); + 
folio_get(e4b->bd_bitmap_folio); } while (*n) { parent = *n; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 56938532b4ce..d8553f1498d3 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -187,14 +187,14 @@ struct ext4_allocation_context { struct ext4_free_extent ac_f_ex; /* - * goal len can change in CR1.5, so save the original len. This is - * used while adjusting the PA window and for accounting. + * goal len can change in CR_BEST_AVAIL_LEN, so save the original len. + * This is used while adjusting the PA window and for accounting. */ ext4_grpblk_t ac_orig_goal_len; __u32 ac_flags; /* allocation hints */ + __u32 ac_groups_linear_remaining; __u16 ac_groups_scanned; - __u16 ac_groups_linear_remaining; __u16 ac_found; __u16 ac_cX_found[EXT4_MB_NUM_CRS]; __u16 ac_tail; @@ -204,8 +204,8 @@ struct ext4_allocation_context { __u8 ac_2order; /* if request is to allocate 2^N blocks and * N > 0, the field stores N, otherwise 0 */ __u8 ac_op; /* operation, for history only */ - struct page *ac_bitmap_page; - struct page *ac_buddy_page; + struct folio *ac_bitmap_folio; + struct folio *ac_buddy_folio; struct ext4_prealloc_space *ac_pa; struct ext4_locality_group *ac_lg; }; @@ -215,9 +215,9 @@ struct ext4_allocation_context { #define AC_STATUS_BREAK 3 struct ext4_buddy { - struct page *bd_buddy_page; + struct folio *bd_buddy_folio; void *bd_buddy; - struct page *bd_bitmap_page; + struct folio *bd_bitmap_folio; void *bd_bitmap; struct ext4_group_info *bd_info; struct super_block *bd_sb; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 7cd4afa4de1d..204f53b23622 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -199,10 +199,8 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) continue; if (!buffer_mapped(bh)) { err = ext4_get_block(inode, block, bh, 0); - if (err) { - folio_set_error(folio); + if (err) return err; - } if (!buffer_mapped(bh)) { folio_zero_range(folio, block_start, blocksize); 
set_buffer_uptodate(bh); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5e4f65c14dfb..a630b27a4cc6 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2897,7 +2897,7 @@ retry: inode = ext4_new_inode_start_handle(idmap, dir, mode, NULL, 0, NULL, EXT4_HT_DIR, - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT4_MAXQUOTAS_TRANS_BLOCKS(dir->i_sb) + 4 + EXT4_XATTR_TRANS_BLOCKS); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 312bc6813357..ad5543866d21 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -117,7 +117,6 @@ static void ext4_finish_bio(struct bio *bio) if (bio->bi_status) { int err = blk_status_to_errno(bio->bi_status); - folio_set_error(folio); mapping_set_error(folio->mapping, err); } bh = head = folio_buffers(folio); @@ -441,8 +440,6 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); - folio_clear_error(folio); - /* * Comments copied from block_write_full_folio: * diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 21e8f0aebb3c..8494492582ab 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -289,7 +289,6 @@ int ext4_mpage_readpages(struct inode *inode, if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { set_error_page: - folio_set_error(folio); folio_zero_segment(folio, 0, folio_size(folio)); folio_unlock(folio); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3fce1b80c419..c682fb927b64 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -244,7 +244,7 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags) { - gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping, + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS) | __GFP_MOVABLE; return __ext4_sb_bread_gfp(sb, block, op_flags, gfp); @@ -253,7 +253,7 @@ struct 
buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block) { - gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping, + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS); return __ext4_sb_bread_gfp(sb, block, 0, gfp); @@ -492,22 +492,6 @@ static void ext4_maybe_update_superblock(struct super_block *sb) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } -/* - * The del_gendisk() function uninitializes the disk-specific data - * structures, including the bdi structure, without telling anyone - * else. Once this happens, any attempt to call mark_buffer_dirty() - * (for example, by ext4_commit_super), will cause a kernel OOPS. - * This is a kludge to prevent these oops until we can put in a proper - * hook in del_gendisk() to inform the VFS and file system layers. - */ -static int block_device_ejected(struct super_block *sb) -{ - struct inode *bd_inode = sb->s_bdev->bd_inode; - struct backing_dev_info *bdi = inode_to_bdi(bd_inode); - - return bdi->dev == NULL; -} - static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; @@ -2074,8 +2058,7 @@ static int unnote_qf_name(struct fs_context *fc, int qtype) { struct ext4_fs_context *ctx = fc->fs_private; - if (ctx->s_qf_names[qtype]) - kfree(ctx->s_qf_names[qtype]); + kfree(ctx->s_qf_names[qtype]); ctx->s_qf_names[qtype] = NULL; ctx->qname_spec |= 1 << qtype; @@ -2480,8 +2463,7 @@ static int parse_options(struct fs_context *fc, char *options) param.size = v_len; ret = ext4_parse_param(fc, &param); - if (param.string) - kfree(param.string); + kfree(param.string); if (ret < 0) return ret; } @@ -5338,6 +5320,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid)); +
super_set_sysfs_name_bdev(sb); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); @@ -5547,19 +5530,15 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (err) goto failed_mount6; - err = ext4_register_sysfs(sb); - if (err) - goto failed_mount7; - err = ext4_init_orphan_info(sb); if (err) - goto failed_mount8; + goto failed_mount7; #ifdef CONFIG_QUOTA /* Enable quota usage during mount. */ if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { err = ext4_enable_quotas(sb); if (err) - goto failed_mount9; + goto failed_mount8; } #endif /* CONFIG_QUOTA */ @@ -5568,7 +5547,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) * used to detect the metadata async write error. */ spin_lock_init(&sbi->s_bdev_wb_lock); - errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, + errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err, &sbi->s_bdev_wb_err); EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); @@ -5585,7 +5564,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); if (err) - goto failed_mount10; + goto failed_mount9; } if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) @@ -5602,15 +5581,17 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) atomic_set(&sbi->s_warning_count, 0); atomic_set(&sbi->s_msg_count, 0); + /* Register sysfs after all initializations are complete. 
*/ + err = ext4_register_sysfs(sb); + if (err) + goto failed_mount9; + return 0; -failed_mount10: +failed_mount9: ext4_quotas_off(sb, EXT4_MAXQUOTAS); -failed_mount9: __maybe_unused +failed_mount8: __maybe_unused ext4_release_orphan_info(sb); -failed_mount8: - ext4_unregister_sysfs(sb); - kobject_put(&sbi->s_kobj); failed_mount7: ext4_unregister_li_request(sb); failed_mount6: @@ -5869,7 +5850,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb, sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; offset = EXT4_MIN_BLOCK_SIZE % blocksize; - set_blocksize(bdev, blocksize); + set_blocksize(bdev_file, blocksize); bh = __bread(bdev, sb_block, blocksize); if (!bh) { ext4_msg(sb, KERN_ERR, "couldn't read superblock of " @@ -6126,8 +6107,8 @@ static void ext4_update_super(struct super_block *sb) __ext4_update_tstamp(&es->s_first_error_time, &es->s_first_error_time_hi, sbi->s_first_error_time); - strncpy(es->s_first_error_func, sbi->s_first_error_func, - sizeof(es->s_first_error_func)); + strtomem_pad(es->s_first_error_func, + sbi->s_first_error_func, 0); es->s_first_error_line = cpu_to_le32(sbi->s_first_error_line); es->s_first_error_ino = @@ -6140,8 +6121,7 @@ static void ext4_update_super(struct super_block *sb) __ext4_update_tstamp(&es->s_last_error_time, &es->s_last_error_time_hi, sbi->s_last_error_time); - strncpy(es->s_last_error_func, sbi->s_last_error_func, - sizeof(es->s_last_error_func)); + strtomem_pad(es->s_last_error_func, sbi->s_last_error_func, 0); es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block); @@ -6168,8 +6148,6 @@ static int ext4_commit_super(struct super_block *sb) if (!sbh) return -EINVAL; - if (block_device_ejected(sb)) - return -ENODEV; ext4_update_super(sb); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 6d332dff79dd..ddb54608ca2e 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -29,7 +29,10 @@ 
typedef enum { attr_trigger_test_error, attr_first_error_time, attr_last_error_time, + attr_clusters_in_group, + attr_mb_order, attr_feature, + attr_pointer_pi, attr_pointer_ui, attr_pointer_ul, attr_pointer_u64, @@ -104,7 +107,7 @@ static ssize_t reserved_clusters_store(struct ext4_sb_info *sbi, int ret; ret = kstrtoull(skip_spaces(buf), 0, &val); - if (ret || val >= clusters) + if (ret || val >= clusters || (s64)val < 0) return -EINVAL; atomic64_set(&sbi->s_resv_clusters, val); @@ -178,6 +181,9 @@ static struct ext4_attr ext4_attr_##_name = { \ #define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \ EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname) +#define EXT4_RW_ATTR_SBI_PI(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0644, pointer_pi, ext4_sb_info, _elname) + #define EXT4_RW_ATTR_SBI_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname) @@ -207,23 +213,25 @@ EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, ext4_sb_info, s_inode_readahead_blks); +EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group, + ext4_sb_info, s_mb_group_prealloc); +EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order, + ext4_sb_info, s_mb_best_avail_max_trim_order); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); -EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); -EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst); 
-EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order); +EXT4_RW_ATTR_SBI_PI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(err_ratelimit_burst, s_err_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); #ifdef CONFIG_EXT4_DEBUG EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); #endif @@ -366,13 +374,45 @@ static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi) #define print_tstamp(buf, es, tstamp) \ __print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi) +static ssize_t ext4_generic_attr_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + void *ptr = calc_ptr(a, sbi); + + if (!ptr) + return 0; + + switch (a->attr_id) { + case attr_inode_readahead: + case attr_clusters_in_group: + case attr_mb_order: + case attr_pointer_pi: + case attr_pointer_ui: + if (a->attr_ptr == ptr_ext4_super_block_offset) + return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr)); + return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr)); + case attr_pointer_ul: + return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr)); + case attr_pointer_u8: + return sysfs_emit(buf, "%u\n", *((unsigned char *) ptr)); + case attr_pointer_u64: + if (a->attr_ptr == ptr_ext4_super_block_offset) + return sysfs_emit(buf, "%llu\n", le64_to_cpup(ptr)); + return sysfs_emit(buf, "%llu\n", *((unsigned long long *) ptr)); + case 
attr_pointer_string: + return sysfs_emit(buf, "%.*s\n", a->attr_size, (char *) ptr); + case attr_pointer_atomic: + return sysfs_emit(buf, "%d\n", atomic_read((atomic_t *) ptr)); + } + return 0; +} + static ssize_t ext4_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, s_kobj); struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - void *ptr = calc_ptr(a, sbi); switch (a->attr_id) { case attr_delayed_allocation_blocks: @@ -391,45 +431,6 @@ static ssize_t ext4_attr_show(struct kobject *kobj, return sysfs_emit(buf, "%llu\n", (unsigned long long) percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit)); - case attr_inode_readahead: - case attr_pointer_ui: - if (!ptr) - return 0; - if (a->attr_ptr == ptr_ext4_super_block_offset) - return sysfs_emit(buf, "%u\n", - le32_to_cpup(ptr)); - else - return sysfs_emit(buf, "%u\n", - *((unsigned int *) ptr)); - case attr_pointer_ul: - if (!ptr) - return 0; - return sysfs_emit(buf, "%lu\n", - *((unsigned long *) ptr)); - case attr_pointer_u8: - if (!ptr) - return 0; - return sysfs_emit(buf, "%u\n", - *((unsigned char *) ptr)); - case attr_pointer_u64: - if (!ptr) - return 0; - if (a->attr_ptr == ptr_ext4_super_block_offset) - return sysfs_emit(buf, "%llu\n", - le64_to_cpup(ptr)); - else - return sysfs_emit(buf, "%llu\n", - *((unsigned long long *) ptr)); - case attr_pointer_string: - if (!ptr) - return 0; - return sysfs_emit(buf, "%.*s\n", a->attr_size, - (char *) ptr); - case attr_pointer_atomic: - if (!ptr) - return 0; - return sysfs_emit(buf, "%d\n", - atomic_read((atomic_t *) ptr)); case attr_feature: return sysfs_emit(buf, "supported\n"); case attr_first_error_time: @@ -438,29 +439,34 @@ static ssize_t ext4_attr_show(struct kobject *kobj, return print_tstamp(buf, sbi->s_es, s_last_error_time); case attr_journal_task: return journal_task_show(sbi, buf); + default: + return ext4_generic_attr_show(a, sbi, buf); } - - return 0; } 
-static ssize_t ext4_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) +static ssize_t ext4_generic_attr_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t len) { - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - void *ptr = calc_ptr(a, sbi); - unsigned long t; int ret; + unsigned int t; + unsigned long lt; + void *ptr = calc_ptr(a, sbi); + + if (!ptr) + return 0; switch (a->attr_id) { - case attr_reserved_clusters: - return reserved_clusters_store(sbi, buf, len); + case attr_pointer_pi: + ret = kstrtouint(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if ((int)t < 0) + return -EINVAL; + *((unsigned int *) ptr) = t; + return len; case attr_pointer_ui: - if (!ptr) - return 0; - ret = kstrtoul(skip_spaces(buf), 0, &t); + ret = kstrtouint(skip_spaces(buf), 0, &t); if (ret) return ret; if (a->attr_ptr == ptr_ext4_super_block_offset) @@ -468,20 +474,50 @@ static ssize_t ext4_attr_store(struct kobject *kobj, else *((unsigned int *) ptr) = t; return len; + case attr_mb_order: + ret = kstrtouint(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t > 64) + return -EINVAL; + *((unsigned int *) ptr) = t; + return len; + case attr_clusters_in_group: + ret = kstrtouint(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t > sbi->s_clusters_per_group) + return -EINVAL; + *((unsigned int *) ptr) = t; + return len; case attr_pointer_ul: - if (!ptr) - return 0; - ret = kstrtoul(skip_spaces(buf), 0, &t); + ret = kstrtoul(skip_spaces(buf), 0, &lt); if (ret) return ret; - *((unsigned long *) ptr) = t; + *((unsigned long *) ptr) = lt; return len; + } + return 0; +} + +static ssize_t ext4_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, 
struct ext4_attr, attr); + + switch (a->attr_id) { + case attr_reserved_clusters: + return reserved_clusters_store(sbi, buf, len); case attr_inode_readahead: return inode_readahead_blks_store(sbi, buf, len); case attr_trigger_test_error: return trigger_test_error(sbi, buf, len); + default: + return ext4_generic_attr_store(a, sbi, buf, len); } - return 0; } static void ext4_sb_release(struct kobject *kobj) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index b67a176bfcf9..6460879b9fcb 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1619,6 +1619,7 @@ out_err: static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s, handle_t *handle, struct inode *inode, + struct inode *new_ea_inode, bool is_block) { struct ext4_xattr_entry *last, *next; @@ -1626,7 +1627,6 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, size_t min_offs = s->end - s->base, name_len = strlen(i->name); int in_inode = i->in_inode; struct inode *old_ea_inode = NULL; - struct inode *new_ea_inode = NULL; size_t old_size, new_size; int ret; @@ -1711,38 +1711,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, old_ea_inode = NULL; goto out; } - } - if (i->value && in_inode) { - WARN_ON_ONCE(!i->value_len); - - new_ea_inode = ext4_xattr_inode_lookup_create(handle, inode, - i->value, i->value_len); - if (IS_ERR(new_ea_inode)) { - ret = PTR_ERR(new_ea_inode); - new_ea_inode = NULL; - goto out; - } - } - if (old_ea_inode) { /* We are ready to release ref count on the old_ea_inode. */ ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); - if (ret) { - /* Release newly required ref count on new_ea_inode. 
*/ - if (new_ea_inode) { - int err; - - err = ext4_xattr_inode_dec_ref(handle, - new_ea_inode); - if (err) - ext4_warning_inode(new_ea_inode, - "dec ref new_ea_inode err=%d", - err); - ext4_xattr_inode_free_quota(inode, new_ea_inode, - i->value_len); - } + if (ret) goto out; - } ext4_xattr_inode_free_quota(inode, old_ea_inode, le32_to_cpu(here->e_value_size)); @@ -1866,7 +1839,6 @@ update_hash: ret = 0; out: iput(old_ea_inode); - iput(new_ea_inode); return ret; } @@ -1929,9 +1901,21 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, size_t old_ea_inode_quota = 0; unsigned int ea_ino; - #define header(x) ((struct ext4_xattr_header *)(x)) + /* If we need EA inode, prepare it before locking the buffer */ + if (i->value && i->in_inode) { + WARN_ON_ONCE(!i->value_len); + + ea_inode = ext4_xattr_inode_lookup_create(handle, inode, + i->value, i->value_len); + if (IS_ERR(ea_inode)) { + error = PTR_ERR(ea_inode); + ea_inode = NULL; + goto cleanup; + } + } + if (s->base) { int offset = (char *)s->here - bs->bh->b_data; @@ -1940,6 +1924,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, EXT4_JTR_NONE); if (error) goto cleanup; + lock_buffer(bs->bh); if (header(s->base)->h_refcount == cpu_to_le32(1)) { @@ -1966,7 +1951,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, } ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s, handle, inode, - true /* is_block */); + ea_inode, true /* is_block */); ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) @@ -2034,33 +2019,22 @@ clone_block: s->end = s->base + sb->s_blocksize; } - error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); + error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, + true /* is_block */); if (error == -EFSCORRUPTED) goto bad_block; if (error) goto cleanup; - if (i->value && s->here->e_value_inum) { - /* - * A ref count on ea_inode has been taken as part of the call to - * 
ext4_xattr_set_entry() above. We would like to drop this - * extra ref but we have to wait until the xattr block is - * initialized and has its own ref count on the ea_inode. - */ - ea_ino = le32_to_cpu(s->here->e_value_inum); - error = ext4_xattr_inode_iget(inode, ea_ino, - le32_to_cpu(s->here->e_hash), - &ea_inode); - if (error) { - ea_inode = NULL; +inserted: + if (!IS_LAST_ENTRY(s->first)) { + new_bh = ext4_xattr_block_cache_find(inode, header(s->base), &ce); + if (IS_ERR(new_bh)) { + error = PTR_ERR(new_bh); + new_bh = NULL; goto cleanup; } - } -inserted: - if (!IS_LAST_ENTRY(s->first)) { - new_bh = ext4_xattr_block_cache_find(inode, header(s->base), - &ce); if (new_bh) { /* We found an identical block in the cache. */ if (new_bh == bs->bh) @@ -2158,6 +2132,17 @@ getblk_failed: ENTRY(header(s->base)+1)); if (error) goto getblk_failed; + if (ea_inode) { + /* Drop the extra ref on ea_inode. */ + error = ext4_xattr_inode_dec_ref(handle, + ea_inode); + if (error) + ext4_warning_inode(ea_inode, + "dec ref error=%d", + error); + iput(ea_inode); + ea_inode = NULL; + } lock_buffer(new_bh); error = ext4_journal_get_create_access(handle, sb, @@ -2198,17 +2183,16 @@ getblk_failed: cleanup: if (ea_inode) { - int error2; - - error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); - if (error2) - ext4_warning_inode(ea_inode, "dec ref error=%d", - error2); + if (error) { + int error2; - /* If there was an error, revert the quota charge. 
*/ - if (error) + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (error2) + ext4_warning_inode(ea_inode, "dec ref error=%d", + error2); ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); + } iput(ea_inode); } if (ce) @@ -2266,14 +2250,38 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; + struct inode *ea_inode = NULL; int error; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return -ENOSPC; - error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); - if (error) + /* If we need EA inode, prepare it before locking the buffer */ + if (i->value && i->in_inode) { + WARN_ON_ONCE(!i->value_len); + + ea_inode = ext4_xattr_inode_lookup_create(handle, inode, + i->value, i->value_len); + if (IS_ERR(ea_inode)) + return PTR_ERR(ea_inode); + } + error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, + false /* is_block */); + if (error) { + if (ea_inode) { + int error2; + + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (error2) + ext4_warning_inode(ea_inode, "dec ref error=%d", + error2); + + ext4_xattr_inode_free_quota(inode, ea_inode, + i_size_read(ea_inode)); + iput(ea_inode); + } return error; + } header = IHDR(inode, ext4_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); @@ -2282,6 +2290,7 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, header->h_magic = cpu_to_le32(0); ext4_clear_inode_state(inode, EXT4_STATE_XATTR); } + iput(ea_inode); return 0; } @@ -3090,8 +3099,8 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, * * Find an identical extended attribute block. * - * Returns a pointer to the block found, or NULL if such a block was - * not found or an error occurred. + * Returns a pointer to the block found, or NULL if such a block was not + * found, or an error pointer if an error occurred while reading ea block. 
*/ static struct buffer_head * ext4_xattr_block_cache_find(struct inode *inode, @@ -3113,11 +3122,11 @@ ext4_xattr_block_cache_find(struct inode *inode, bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO); if (IS_ERR(bh)) { - if (PTR_ERR(bh) == -ENOMEM) - return NULL; - bh = NULL; - EXT4_ERROR_INODE(inode, "block %lu read error", - (unsigned long)ce->e_value); + if (PTR_ERR(bh) != -ENOMEM) + EXT4_ERROR_INODE(inode, "block %lu read error", + (unsigned long)ce->e_value); + mb_cache_entry_put(ea_block_cache, ce); + return bh; } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { *pce = ce; return bh; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index eac698b8dd38..55d444bec5c0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -179,22 +179,22 @@ static bool __f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, break; case META_SIT: if (unlikely(blkaddr >= SIT_BLK_CNT(sbi))) - goto err; + goto check_only; break; case META_SSA: if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) || blkaddr < SM_I(sbi)->ssa_blkaddr)) - goto err; + goto check_only; break; case META_CP: if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr || blkaddr < __start_cp_addr(sbi))) - goto err; + goto check_only; break; case META_POR: if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || blkaddr < MAIN_BLKADDR(sbi))) - goto err; + goto check_only; break; case DATA_GENERIC: case DATA_GENERIC_ENHANCE: @@ -228,6 +228,7 @@ static bool __f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, return true; err: f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); +check_only: return false; } @@ -345,7 +346,7 @@ static int __f2fs_write_meta_page(struct page *page, { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - trace_f2fs_writepage(page, META); + trace_f2fs_writepage(page_folio(page), META); if (unlikely(f2fs_cp_error(sbi))) { if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) { @@ -492,7 +493,7 @@ stop: static bool f2fs_dirty_meta_folio(struct address_space *mapping, struct folio *folio) { - 
trace_f2fs_set_page_dirty(&folio->page, META); + trace_f2fs_set_page_dirty(folio, META); if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 8892c8262141..1ef82a546391 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -198,8 +198,8 @@ static int lzo_compress_pages(struct compress_ctx *cc) ret = lzo1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata, &cc->clen, cc->private); if (ret != LZO_E_OK) { - printk_ratelimited("%sF2FS-fs (%s): lzo compress failed, ret:%d\n", - KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret); + f2fs_err_ratelimited(F2FS_I_SB(cc->inode), + "lzo compress failed, ret:%d", ret); return -EIO; } return 0; @@ -212,17 +212,15 @@ static int lzo_decompress_pages(struct decompress_io_ctx *dic) ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen, dic->rbuf, &dic->rlen); if (ret != LZO_E_OK) { - printk_ratelimited("%sF2FS-fs (%s): lzo decompress failed, ret:%d\n", - KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret); + f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + "lzo decompress failed, ret:%d", ret); return -EIO; } if (dic->rlen != PAGE_SIZE << dic->log_cluster_size) { - printk_ratelimited("%sF2FS-fs (%s): lzo invalid rlen:%zu, " - "expected:%lu\n", KERN_ERR, - F2FS_I_SB(dic->inode)->sb->s_id, - dic->rlen, - PAGE_SIZE << dic->log_cluster_size); + f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + "lzo invalid rlen:%zu, expected:%lu", + dic->rlen, PAGE_SIZE << dic->log_cluster_size); return -EIO; } return 0; @@ -294,16 +292,15 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic) ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf, dic->clen, dic->rlen); if (ret < 0) { - printk_ratelimited("%sF2FS-fs (%s): lz4 decompress failed, ret:%d\n", - KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret); + f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + "lz4 decompress failed, ret:%d", ret); return -EIO; } if (ret != PAGE_SIZE << dic->log_cluster_size) { - printk_ratelimited("%sF2FS-fs 
(%s): lz4 invalid ret:%d, " - "expected:%lu\n", KERN_ERR, - F2FS_I_SB(dic->inode)->sb->s_id, ret, - PAGE_SIZE << dic->log_cluster_size); + f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + "lz4 invalid ret:%d, expected:%lu", + ret, PAGE_SIZE << dic->log_cluster_size); return -EIO; } return 0; @@ -350,9 +347,8 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) stream = zstd_init_cstream(&params, 0, workspace, workspace_size); if (!stream) { - printk_ratelimited("%sF2FS-fs (%s): %s zstd_init_cstream failed\n", - KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, - __func__); + f2fs_err_ratelimited(F2FS_I_SB(cc->inode), + "%s zstd_init_cstream failed", __func__); kvfree(workspace); return -EIO; } @@ -390,16 +386,16 @@ static int zstd_compress_pages(struct compress_ctx *cc) ret = zstd_compress_stream(stream, &outbuf, &inbuf); if (zstd_is_error(ret)) { - printk_ratelimited("%sF2FS-fs (%s): %s zstd_compress_stream failed, ret: %d\n", - KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, + f2fs_err_ratelimited(F2FS_I_SB(cc->inode), + "%s zstd_compress_stream failed, ret: %d", __func__, zstd_get_error_code(ret)); return -EIO; } ret = zstd_end_stream(stream, &outbuf); if (zstd_is_error(ret)) { - printk_ratelimited("%sF2FS-fs (%s): %s zstd_end_stream returned %d\n", - KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, + f2fs_err_ratelimited(F2FS_I_SB(cc->inode), + "%s zstd_end_stream returned %d", __func__, zstd_get_error_code(ret)); return -EIO; } @@ -432,9 +428,8 @@ static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic) stream = zstd_init_dstream(max_window_size, workspace, workspace_size); if (!stream) { - printk_ratelimited("%sF2FS-fs (%s): %s zstd_init_dstream failed\n", - KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, - __func__); + f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + "%s zstd_init_dstream failed", __func__); kvfree(workspace); return -EIO; } @@ -469,16 +464,15 @@ static int zstd_decompress_pages(struct decompress_io_ctx *dic) ret = zstd_decompress_stream(stream, &outbuf, 
&inbuf); if (zstd_is_error(ret)) { - printk_ratelimited("%sF2FS-fs (%s): %s zstd_decompress_stream failed, ret: %d\n", - KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, + f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + "%s zstd_decompress_stream failed, ret: %d", __func__, zstd_get_error_code(ret)); return -EIO; } if (dic->rlen != outbuf.pos) { - printk_ratelimited("%sF2FS-fs (%s): %s ZSTD invalid rlen:%zu, " - "expected:%lu\n", KERN_ERR, - F2FS_I_SB(dic->inode)->sb->s_id, + f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + "%s ZSTD invalid rlen:%zu, expected:%lu", __func__, dic->rlen, PAGE_SIZE << dic->log_cluster_size); return -EIO; @@ -1031,6 +1025,31 @@ static void set_cluster_writeback(struct compress_ctx *cc) } } +static void cancel_cluster_writeback(struct compress_ctx *cc, + struct compress_io_ctx *cic, int submitted) +{ + int i; + + /* Wait for submitted IOs. */ + if (submitted > 1) { + f2fs_submit_merged_write(F2FS_I_SB(cc->inode), DATA); + while (atomic_read(&cic->pending_pages) != + (cc->valid_nr_cpages - submitted + 1)) + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + } + + /* Cancel writeback and stay locked. */ + for (i = 0; i < cc->cluster_size; i++) { + if (i < submitted) { + inode_inc_dirty_pages(cc->inode); + lock_page(cc->rpages[i]); + } + clear_page_private_gcing(cc->rpages[i]); + if (folio_test_writeback(page_folio(cc->rpages[i]))) + end_page_writeback(cc->rpages[i]); + } +} + static void set_cluster_dirty(struct compress_ctx *cc) { int i; @@ -1232,7 +1251,6 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, .page = NULL, .encrypted_page = NULL, .compressed_page = NULL, - .submitted = 0, .io_type = io_type, .io_wbc = wbc, .encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode) ? 
@@ -1358,7 +1376,16 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, fio.compressed_page = cc->cpages[i - 1]; cc->cpages[i - 1] = NULL; + fio.submitted = 0; f2fs_outplace_write_data(&dn, &fio); + if (unlikely(!fio.submitted)) { + cancel_cluster_writeback(cc, cic, i); + + /* To call fscrypt_finalize_bounce_page */ + i = cc->valid_nr_cpages; + *submitted = 0; + goto out_destroy_crypt; + } (*submitted)++; unlock_continue: inode_dec_dirty_pages(cc->inode); @@ -1392,8 +1419,11 @@ unlock_continue: out_destroy_crypt: page_array_free(cc->inode, cic->rpages, cc->cluster_size); - for (--i; i >= 0; i--) + for (--i; i >= 0; i--) { + if (!cc->cpages[i]) + continue; fscrypt_finalize_bounce_page(&cc->cpages[i]); + } out_put_cic: kmem_cache_free(cic_entry_slab, cic); out_put_dnode: @@ -1484,7 +1514,7 @@ continue_unlock: if (!PageDirty(cc->rpages[i])) goto continue_unlock; - if (PageWriteback(cc->rpages[i])) { + if (folio_test_writeback(page_folio(cc->rpages[i]))) { if (wbc->sync_mode == WB_SYNC_NONE) goto continue_unlock; f2fs_wait_on_page_writeback(cc->rpages[i], DATA, true, true); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d9494b5fc7c1..b9b0debc6b3d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -465,6 +465,8 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) } else { bio->bi_end_io = f2fs_write_end_io; bio->bi_private = sbi; + bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, + fio->type, fio->temp); } iostat_alloc_and_bind_ctx(sbi, bio, NULL); @@ -593,17 +595,20 @@ int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) return -ENOMEM; for (j = HOT; j < n; j++) { - init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); - sbi->write_io[i][j].sbi = sbi; - sbi->write_io[i][j].bio = NULL; - spin_lock_init(&sbi->write_io[i][j].io_lock); - INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); - INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); - init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); + struct f2fs_bio_info *io = &sbi->write_io[i][j]; + 
+ init_f2fs_rwsem(&io->io_rwsem); + io->sbi = sbi; + io->bio = NULL; + io->last_block_in_bio = 0; + spin_lock_init(&io->io_lock); + INIT_LIST_HEAD(&io->io_list); + INIT_LIST_HEAD(&io->bio_list); + init_f2fs_rwsem(&io->bio_list_lock); #ifdef CONFIG_BLK_DEV_ZONED - init_completion(&sbi->write_io[i][j].zone_wait); - sbi->write_io[i][j].zone_pending_bio = NULL; - sbi->write_io[i][j].bi_private = NULL; + init_completion(&io->zone_wait); + io->zone_pending_bio = NULL; + io->bi_private = NULL; #endif } } @@ -1507,6 +1512,25 @@ static bool f2fs_map_blocks_cached(struct inode *inode, return true; } +static bool map_is_mergeable(struct f2fs_sb_info *sbi, + struct f2fs_map_blocks *map, + block_t blkaddr, int flag, int bidx, + int ofs) +{ + if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev) + return false; + if (map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs)) + return true; + if (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) + return true; + if (flag == F2FS_GET_BLOCK_PRE_DIO) + return true; + if (flag == F2FS_GET_BLOCK_DIO && + map->m_pblk == NULL_ADDR && blkaddr == NULL_ADDR) + return true; + return false; +} + /* * f2fs_map_blocks() tries to find or build mapping relationship which * maps continuous logical blocks to physical blocks, and return such @@ -1574,8 +1598,9 @@ next_block: } /* use out-place-update for direct IO under LFS mode */ - if (map->m_may_create && - (is_hole || (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO))) { + if (map->m_may_create && (is_hole || + (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) && + !f2fs_is_pinned_file(inode)))) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto sync_out; @@ -1628,6 +1653,10 @@ next_block: goto sync_out; } break; + case F2FS_GET_BLOCK_DIO: + if (map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; + break; default: /* for defragment case */ if (map->m_next_pgofs) @@ -1646,19 +1675,15 @@ next_block: /* reserved delalloc block should be mapped for fiemap. 
*/ if (blkaddr == NEW_ADDR) map->m_flags |= F2FS_MAP_DELALLOC; - map->m_flags |= F2FS_MAP_MAPPED; + if (flag != F2FS_GET_BLOCK_DIO || !is_hole) + map->m_flags |= F2FS_MAP_MAPPED; map->m_pblk = blkaddr; map->m_len = 1; if (map->m_multidev_dio) map->m_bdev = FDEV(bidx).bdev; - } else if ((map->m_pblk != NEW_ADDR && - blkaddr == (map->m_pblk + ofs)) || - (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) || - flag == F2FS_GET_BLOCK_PRE_DIO) { - if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev) - goto sync_out; + } else if (map_is_mergeable(sbi, map, blkaddr, flag, bidx, ofs)) { ofs++; map->m_len++; } else { @@ -2042,7 +2067,7 @@ static inline loff_t f2fs_readpage_limit(struct inode *inode) return i_size_read(inode); } -static int f2fs_read_single_page(struct inode *inode, struct page *page, +static int f2fs_read_single_page(struct inode *inode, struct folio *folio, unsigned nr_pages, struct f2fs_map_blocks *map, struct bio **bio_ret, @@ -2055,9 +2080,10 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, sector_t last_block; sector_t last_block_in_file; sector_t block_nr; + pgoff_t index = folio_index(folio); int ret = 0; - block_in_file = (sector_t)page_index(page); + block_in_file = (sector_t)index; last_block = block_in_file + nr_pages; last_block_in_file = bytes_to_blks(inode, f2fs_readpage_limit(inode) + blocksize - 1); @@ -2088,7 +2114,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, got_it: if ((map->m_flags & F2FS_MAP_MAPPED)) { block_nr = map->m_pblk + block_in_file - map->m_lblk; - SetPageMappedToDisk(page); + folio_set_mappedtodisk(folio); if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, DATA_GENERIC_ENHANCE_READ)) { @@ -2097,15 +2123,15 @@ got_it: } } else { zero_out: - zero_user_segment(page, 0, PAGE_SIZE); - if (f2fs_need_verity(inode, page->index) && - !fsverity_verify_page(page)) { + folio_zero_segment(folio, 0, folio_size(folio)); + if (f2fs_need_verity(inode, index) && + 
!fsverity_verify_folio(folio)) { ret = -EIO; goto out; } - if (!PageUptodate(page)) - SetPageUptodate(page); - unlock_page(page); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + folio_unlock(folio); goto out; } @@ -2115,14 +2141,14 @@ zero_out: */ if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio, *last_block_in_bio, block_nr) || - !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) { + !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) { submit_and_realloc: f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, - is_readahead ? REQ_RAHEAD : 0, page->index, + is_readahead ? REQ_RAHEAD : 0, index, false); if (IS_ERR(bio)) { ret = PTR_ERR(bio); @@ -2137,7 +2163,7 @@ submit_and_realloc: */ f2fs_wait_on_block_writeback(inode, block_nr); - if (bio_add_page(bio, page, blocksize, 0) < blocksize) + if (!bio_add_folio(bio, folio, blocksize, 0)) goto submit_and_realloc; inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); @@ -2324,7 +2350,7 @@ out: * Major change was from block_size == page_size in f2fs by default. */ static int f2fs_mpage_readpages(struct inode *inode, - struct readahead_control *rac, struct page *page) + struct readahead_control *rac, struct folio *folio) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; @@ -2344,6 +2370,7 @@ static int f2fs_mpage_readpages(struct inode *inode, #endif unsigned nr_pages = rac ? 
readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; + pgoff_t index; int ret = 0; map.m_pblk = 0; @@ -2357,64 +2384,63 @@ static int f2fs_mpage_readpages(struct inode *inode, for (; nr_pages; nr_pages--) { if (rac) { - page = readahead_page(rac); - prefetchw(&page->flags); + folio = readahead_folio(rac); + prefetchw(&folio->flags); } -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (f2fs_compressed_file(inode)) { - /* there are remained compressed pages, submit them */ - if (!f2fs_cluster_can_merge_page(&cc, page->index)) { - ret = f2fs_read_multi_pages(&cc, &bio, - max_nr_pages, - &last_block_in_bio, - rac != NULL, false); - f2fs_destroy_compress_ctx(&cc, false); - if (ret) - goto set_error_page; - } - if (cc.cluster_idx == NULL_CLUSTER) { - if (nc_cluster_idx == - page->index >> cc.log_cluster_size) { - goto read_single_page; - } - - ret = f2fs_is_compressed_cluster(inode, page->index); - if (ret < 0) - goto set_error_page; - else if (!ret) { - nc_cluster_idx = - page->index >> cc.log_cluster_size; - goto read_single_page; - } + index = folio_index(folio); - nc_cluster_idx = NULL_CLUSTER; - } - ret = f2fs_init_compress_ctx(&cc); +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!f2fs_compressed_file(inode)) + goto read_single_page; + + /* there are remained compressed pages, submit them */ + if (!f2fs_cluster_can_merge_page(&cc, index)) { + ret = f2fs_read_multi_pages(&cc, &bio, + max_nr_pages, + &last_block_in_bio, + rac != NULL, false); + f2fs_destroy_compress_ctx(&cc, false); if (ret) goto set_error_page; + } + if (cc.cluster_idx == NULL_CLUSTER) { + if (nc_cluster_idx == index >> cc.log_cluster_size) + goto read_single_page; - f2fs_compress_ctx_add_page(&cc, page); + ret = f2fs_is_compressed_cluster(inode, index); + if (ret < 0) + goto set_error_page; + else if (!ret) { + nc_cluster_idx = + index >> cc.log_cluster_size; + goto read_single_page; + } - goto next_page; + nc_cluster_idx = NULL_CLUSTER; } + ret = f2fs_init_compress_ctx(&cc); + if (ret) + goto 
set_error_page; + + f2fs_compress_ctx_add_page(&cc, &folio->page); + + goto next_page; read_single_page: #endif - ret = f2fs_read_single_page(inode, page, max_nr_pages, &map, + ret = f2fs_read_single_page(inode, folio, max_nr_pages, &map, &bio, &last_block_in_bio, rac); if (ret) { #ifdef CONFIG_F2FS_FS_COMPRESSION set_error_page: #endif - zero_user_segment(page, 0, PAGE_SIZE); - unlock_page(page); + folio_zero_segment(folio, 0, folio_size(folio)); + folio_unlock(folio); } #ifdef CONFIG_F2FS_FS_COMPRESSION next_page: #endif - if (rac) - put_page(page); #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { @@ -2436,22 +2462,21 @@ next_page: static int f2fs_read_data_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = folio_file_mapping(folio)->host; int ret = -EAGAIN; - trace_f2fs_readpage(page, DATA); + trace_f2fs_readpage(folio, DATA); if (!f2fs_is_compress_backend_ready(inode)) { - unlock_page(page); + folio_unlock(folio); return -EOPNOTSUPP; } /* If the file has inline data, try to read it directly */ if (f2fs_has_inline_data(inode)) - ret = f2fs_read_inline_data(inode, page); + ret = f2fs_read_inline_data(inode, folio); if (ret == -EAGAIN) - ret = f2fs_mpage_readpages(inode, NULL, page); + ret = f2fs_mpage_readpages(inode, NULL, folio); return ret; } @@ -2685,12 +2710,11 @@ got_it: if (err) { if (fscrypt_inode_uses_fs_layer_crypto(inode)) fscrypt_finalize_bounce_page(&fio->encrypted_page); - if (PageWriteback(page)) - end_page_writeback(page); + end_page_writeback(page); } else { set_inode_flag(inode, FI_UPDATE_WRITE); } - trace_f2fs_do_write_data_page(fio->page, IPU); + trace_f2fs_do_write_data_page(page_folio(page), IPU); return err; } @@ -2719,7 +2743,7 @@ got_it: /* LFS mode write path */ f2fs_outplace_write_data(&dn, fio); - trace_f2fs_do_write_data_page(page, OPU); + trace_f2fs_do_write_data_page(page_folio(page), OPU); 
set_inode_flag(inode, FI_APPEND_WRITE); out_writepage: f2fs_put_dnode(&dn); @@ -2766,7 +2790,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, .last_block = last_block, }; - trace_f2fs_writepage(page, DATA); + trace_f2fs_writepage(page_folio(page), DATA); /* we should bypass data pages to proceed the kworker jobs */ if (unlikely(f2fs_cp_error(sbi))) { @@ -3379,7 +3403,7 @@ restart: if (f2fs_has_inline_data(inode)) { if (pos + len <= MAX_INLINE_DATA(inode)) { - f2fs_do_read_inline_data(page, ipage); + f2fs_do_read_inline_data(page_folio(page), ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) set_page_private_inline(ipage); @@ -3740,7 +3764,7 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping, { struct inode *inode = mapping->host; - trace_f2fs_set_page_dirty(&folio->page, DATA); + trace_f2fs_set_page_dirty(folio, DATA); if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); @@ -3896,15 +3920,14 @@ static int check_swap_activate(struct swap_info_struct *sis, struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - sector_t cur_lblock; - sector_t last_lblock; - sector_t pblock; - sector_t lowest_pblock = -1; - sector_t highest_pblock = 0; + block_t cur_lblock; + block_t last_lblock; + block_t pblock; + block_t lowest_pblock = -1; + block_t highest_pblock = 0; int nr_extents = 0; - unsigned long nr_pblocks; + unsigned int nr_pblocks; unsigned int blks_per_sec = BLKS_PER_SEC(sbi); - unsigned int sec_blks_mask = BLKS_PER_SEC(sbi) - 1; unsigned int not_aligned = 0; int ret = 0; @@ -3942,8 +3965,8 @@ retry: pblock = map.m_pblk; nr_pblocks = map.m_len; - if ((pblock - SM_I(sbi)->main_blkaddr) & sec_blks_mask || - nr_pblocks & sec_blks_mask || + if ((pblock - SM_I(sbi)->main_blkaddr) % blks_per_sec || + nr_pblocks % blks_per_sec || !f2fs_valid_pinned_area(sbi, pblock)) { bool last_extent = false; @@ -4082,11 +4105,12 @@ const struct 
address_space_operations f2fs_dblock_aops = { void f2fs_clear_page_cache_dirty_tag(struct page *page) { - struct address_space *mapping = page_mapping(page); + struct folio *folio = page_folio(page); + struct address_space *mapping = folio->mapping; unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); - __xa_clear_mark(&mapping->i_pages, page_index(page), + __xa_clear_mark(&mapping->i_pages, folio->index, PAGECACHE_TAG_DIRTY); xa_unlock_irqrestore(&mapping->i_pages, flags); } @@ -4159,7 +4183,8 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_lblk = bytes_to_blks(inode, offset); map.m_len = bytes_to_blks(inode, offset + length - 1) - map.m_lblk + 1; map.m_next_pgofs = &next_pgofs; - map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); + map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), + inode->i_write_hint); if (flags & IOMAP_WRITE) map.m_may_create = true; @@ -4180,12 +4205,13 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, * We should never see delalloc or compressed extents here based on * prior flushing and checks. 
*/ - if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR)) - return -EINVAL; if (WARN_ON_ONCE(map.m_pblk == COMPRESS_ADDR)) return -EINVAL; - if (map.m_pblk != NULL_ADDR) { + if (map.m_flags & F2FS_MAP_MAPPED) { + if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR)) + return -EINVAL; + iomap->length = blks_to_bytes(inode, map.m_len); iomap->type = IOMAP_MAPPED; iomap->flags |= IOMAP_F_MERGED; @@ -4194,9 +4220,17 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, } else { if (flags & IOMAP_WRITE) return -ENOTBLK; - iomap->length = blks_to_bytes(inode, next_pgofs) - - iomap->offset; - iomap->type = IOMAP_HOLE; + + if (map.m_pblk == NULL_ADDR) { + iomap->length = blks_to_bytes(inode, next_pgofs) - + iomap->offset; + iomap->type = IOMAP_HOLE; + } else if (map.m_pblk == NEW_ADDR) { + iomap->length = blks_to_bytes(inode, map.m_len); + iomap->type = IOMAP_UNWRITTEN; + } else { + f2fs_bug_on(F2FS_I_SB(inode), 1); + } iomap->addr = IOMAP_NULL_ADDR; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fced2b7652f4..1974b6aff397 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -72,7 +72,7 @@ enum { struct f2fs_fault_info { atomic_t inject_ops; - unsigned int inject_rate; + int inject_rate; unsigned int inject_type; }; @@ -765,11 +765,6 @@ enum { #define DEF_DIR_LEVEL 0 -enum { - GC_FAILURE_PIN, - MAX_GC_FAILURE -}; - /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ @@ -816,9 +811,10 @@ struct f2fs_inode_info { unsigned long i_flags; /* keep an inode flags for ioctl */ unsigned char i_advise; /* use to give file attribute hints */ unsigned char i_dir_level; /* use for dentry level for large dir */ - unsigned int i_current_depth; /* only for directory depth */ - /* for gc failure statistic */ - unsigned int i_gc_failures[MAX_GC_FAILURE]; + union { + unsigned int i_current_depth; /* only for directory depth */ + unsigned short i_gc_failures; /* for gc failure statistic */ + }; unsigned int i_pino; /* parent inode number 
*/ umode_t i_acl_mode; /* keep file acl mode temporarily */ @@ -1557,6 +1553,7 @@ struct f2fs_sb_info { #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ + unsigned int max_open_zones; /* max open zone resources of the zoned device */ #endif /* for node-related operations */ @@ -1676,7 +1673,7 @@ struct f2fs_sb_info { unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ - u64 gc_pin_file_threshold; + unsigned short gc_pin_file_threshold; struct f2fs_rwsem pin_sem; /* maximum # of trials to find a victim segment for SSR and GC */ @@ -2309,7 +2306,7 @@ static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count, bool partial) { - blkcnt_t diff = 0, release = 0; + long long diff = 0, release = 0; block_t avail_user_block_count; int ret; @@ -2329,26 +2326,27 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, percpu_counter_add(&sbi->alloc_valid_block_count, (*count)); spin_lock(&sbi->stat_lock); - sbi->total_valid_block_count += (block_t)(*count); - avail_user_block_count = get_available_block_count(sbi, inode, true); - if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { + avail_user_block_count = get_available_block_count(sbi, inode, true); + diff = (long long)sbi->total_valid_block_count + *count - + avail_user_block_count; + if (unlikely(diff > 0)) { if (!partial) { spin_unlock(&sbi->stat_lock); + release = *count; goto enospc; } - - diff = sbi->total_valid_block_count - avail_user_block_count; if (diff > *count) diff = *count; *count -= diff; release = diff; - sbi->total_valid_block_count -= diff; if (!*count) { spin_unlock(&sbi->stat_lock); goto enospc; } } + sbi->total_valid_block_count += (block_t)(*count); + spin_unlock(&sbi->stat_lock); if (unlikely(release)) { @@ -3132,7 +3130,7 @@ static inline void 
f2fs_i_depth_write(struct inode *inode, unsigned int depth) static inline void f2fs_i_gc_failures_write(struct inode *inode, unsigned int count) { - F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = count; + F2FS_I(inode)->i_gc_failures = count; f2fs_mark_inode_dirty_sync(inode, true); } @@ -3497,6 +3495,8 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); +int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag, + bool readonly); int f2fs_precache_extents(struct inode *inode); int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa); int f2fs_fileattr_set(struct mnt_idmap *idmap, @@ -3719,6 +3719,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, unsigned char version, bool recover_curseg, bool recover_newaddr); +int f2fs_get_segment_temp(int seg_type); int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, @@ -3741,7 +3742,9 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_segment_manager_caches(void); void f2fs_destroy_segment_manager_caches(void); -int f2fs_rw_hint_to_seg_type(enum rw_hint hint); +int f2fs_rw_hint_to_seg_type(struct f2fs_sb_info *sbi, enum rw_hint hint); +enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp); unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, unsigned int segno); unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, @@ -4148,10 +4151,10 @@ extern struct kmem_cache *f2fs_inode_entry_slab; bool f2fs_may_inline_data(struct inode *inode); bool f2fs_sanity_check_inline_data(struct inode *inode); bool 
f2fs_may_inline_dentry(struct inode *inode); -void f2fs_do_read_inline_data(struct page *page, struct page *ipage); +void f2fs_do_read_inline_data(struct folio *folio, struct page *ipage); void f2fs_truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from); -int f2fs_read_inline_data(struct inode *inode, struct page *page); +int f2fs_read_inline_data(struct inode *inode, struct folio *folio); int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); int f2fs_convert_inline_inode(struct inode *inode); int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry); @@ -4596,10 +4599,14 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) } #ifdef CONFIG_F2FS_FAULT_INJECTION -extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, - unsigned int type); +extern int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, + unsigned long type); #else -#define f2fs_build_fault_attr(sbi, rate, type) do { } while (0) +static inline int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, + unsigned long rate, unsigned long type) +{ + return 0; +} #endif static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) @@ -4657,7 +4664,7 @@ static inline void f2fs_truncate_meta_inode_pages(struct f2fs_sb_info *sbi, page = find_get_page(META_MAPPING(sbi), blkaddr + i); if (page) { - if (PageWriteback(page)) + if (folio_test_writeback(page_folio(page))) need_submit = true; f2fs_put_page(page, 0); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2b65e09822d4..5c0b281a70f3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -58,7 +58,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) struct inode *inode = file_inode(vmf->vma->vm_file); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; - bool need_alloc = true; + bool need_alloc = !f2fs_is_pinned_file(inode); int err = 0; vm_fault_t ret; @@ -115,19 +115,18 @@ static vm_fault_t 
f2fs_vm_page_mkwrite(struct vm_fault *vmf) goto out_sem; } + set_new_dnode(&dn, inode, NULL, NULL, 0); if (need_alloc) { /* block allocation */ - set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_block_locked(&dn, page->index); - } - -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (!need_alloc) { - set_new_dnode(&dn, inode, NULL, NULL, 0); + } else { err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE); f2fs_put_dnode(&dn); + if (f2fs_is_pinned_file(inode) && + !__is_valid_data_blkaddr(dn.data_blkaddr)) + err = -EIO; } -#endif + if (err) { unlock_page(page); goto out_sem; @@ -834,7 +833,8 @@ static bool f2fs_force_buffered_io(struct inode *inode, int rw) * for blkzoned device, fallback direct IO to buffered IO, so * all IOs can be serialized by log-structured write. */ - if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE)) + if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE) && + !f2fs_is_pinned_file(inode)) return true; if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) return true; @@ -952,9 +952,14 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, ATTR_GID | ATTR_TIMES_SET)))) return -EPERM; - if ((attr->ia_valid & ATTR_SIZE) && - !f2fs_is_compress_backend_ready(inode)) - return -EOPNOTSUPP; + if ((attr->ia_valid & ATTR_SIZE)) { + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED) && + !IS_ALIGNED(attr->ia_size, + F2FS_BLK_TO_BYTES(F2FS_I(inode)->i_cluster_size))) + return -EINVAL; + } err = setattr_prepare(idmap, dentry, attr); if (err) @@ -1325,6 +1330,9 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, f2fs_put_page(psrc, 1); return PTR_ERR(pdst); } + + f2fs_wait_on_page_writeback(pdst, DATA, true, true); + memcpy_page(pdst, 0, psrc, 0, PAGE_SIZE); set_page_dirty(pdst); set_page_private_gcing(pdst); @@ -1817,15 +1825,6 @@ static long f2fs_fallocate(struct file *file, int mode, (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return 
-EOPNOTSUPP; - /* - * Pinned file should not support partial truncation since the block - * can be used by applications. - */ - if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) && - (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | - FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE))) - return -EOPNOTSUPP; - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE)) @@ -1833,6 +1832,17 @@ static long f2fs_fallocate(struct file *file, int mode, inode_lock(inode); + /* + * Pinned file should not support partial truncation since the block + * can be used by applications. + */ + if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) && + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | + FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE))) { + ret = -EOPNOTSUPP; + goto out; + } + ret = file_modified(file); if (ret) goto out; @@ -2224,34 +2234,13 @@ static int f2fs_ioc_abort_atomic_write(struct file *filp) return ret; } -static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) +int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag, + bool readonly) { - struct inode *inode = file_inode(filp); - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct super_block *sb = sbi->sb; - __u32 in; int ret = 0; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (get_user(in, (__u32 __user *)arg)) - return -EFAULT; - - if (in != F2FS_GOING_DOWN_FULLSYNC) { - ret = mnt_want_write_file(filp); - if (ret) { - if (ret == -EROFS) { - ret = 0; - f2fs_stop_checkpoint(sbi, false, - STOP_CP_REASON_SHUTDOWN); - trace_f2fs_shutdown(sbi, in, ret); - } - return ret; - } - } - - switch (in) { + switch (flag) { case F2FS_GOING_DOWN_FULLSYNC: ret = bdev_freeze(sb->s_bdev); if (ret) @@ -2290,6 +2279,9 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) goto out; } + if (readonly) + goto out; + f2fs_stop_gc_thread(sbi); f2fs_stop_discard_thread(sbi); @@ 
-2298,10 +2290,44 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_update_time(sbi, REQ_TIME); out: - if (in != F2FS_GOING_DOWN_FULLSYNC) - mnt_drop_write_file(filp); - trace_f2fs_shutdown(sbi, in, ret); + trace_f2fs_shutdown(sbi, flag, ret); + + return ret; +} + +static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + __u32 in; + int ret; + bool need_drop = false, readonly = false; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(in, (__u32 __user *)arg)) + return -EFAULT; + + if (in != F2FS_GOING_DOWN_FULLSYNC) { + ret = mnt_want_write_file(filp); + if (ret) { + if (ret != -EROFS) + return ret; + + /* fallback to nosync shutdown for readonly fs */ + in = F2FS_GOING_DOWN_NOSYNC; + readonly = true; + } else { + need_drop = true; + } + } + + ret = f2fs_do_shutdown(sbi, in, readonly); + + if (need_drop) + mnt_drop_write_file(filp); return ret; } @@ -2354,13 +2380,14 @@ static bool uuid_is_nonzero(__u8 u[16]) static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); + int ret; if (!f2fs_sb_has_encrypt(F2FS_I_SB(inode))) return -EOPNOTSUPP; + ret = fscrypt_ioctl_set_policy(filp, (const void __user *)arg); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - - return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); + return ret; } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) @@ -2607,12 +2634,13 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, bool fragmented = false; int err; - pg_start = range->start >> PAGE_SHIFT; - pg_end = (range->start + range->len) >> PAGE_SHIFT; - f2fs_balance_fs(sbi, true); inode_lock(inode); + pg_start = range->start >> PAGE_SHIFT; + pg_end = min_t(pgoff_t, + (range->start + range->len) >> PAGE_SHIFT, + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); if (is_inode_flag_set(inode, 
FI_COMPRESS_RELEASED)) { err = -EINVAL; @@ -2627,8 +2655,9 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, } /* writeback all dirty pages in the range */ - err = filemap_write_and_wait_range(inode->i_mapping, range->start, - range->start + range->len - 1); + err = filemap_write_and_wait_range(inode->i_mapping, + pg_start << PAGE_SHIFT, + (pg_end << PAGE_SHIFT) - 1); if (err) goto out; @@ -2786,7 +2815,8 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) err = f2fs_defragment_range(sbi, filp, &range); mnt_drop_write_file(filp); - f2fs_update_time(sbi, REQ_TIME); + if (range.len) + f2fs_update_time(sbi, REQ_TIME); if (err < 0) return err; @@ -2837,7 +2867,8 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, goto out; } - if (f2fs_compressed_file(src) || f2fs_compressed_file(dst)) { + if (f2fs_compressed_file(src) || f2fs_compressed_file(dst) || + f2fs_is_pinned_file(src) || f2fs_is_pinned_file(dst)) { ret = -EOPNOTSUPP; goto out_unlock; } @@ -3189,18 +3220,17 @@ int f2fs_pin_file_control(struct inode *inode, bool inc) struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - /* Use i_gc_failures for normal file as a risk signal. */ - if (inc) - f2fs_i_gc_failures_write(inode, - fi->i_gc_failures[GC_FAILURE_PIN] + 1); - - if (fi->i_gc_failures[GC_FAILURE_PIN] > sbi->gc_pin_file_threshold) { + if (fi->i_gc_failures >= sbi->gc_pin_file_threshold) { f2fs_warn(sbi, "%s: Enable GC = ino %lx after %x GC trials", - __func__, inode->i_ino, - fi->i_gc_failures[GC_FAILURE_PIN]); + __func__, inode->i_ino, fi->i_gc_failures); clear_inode_flag(inode, FI_PIN_FILE); return -EAGAIN; } + + /* Use i_gc_failures for normal file as a risk signal. 
*/ + if (inc) + f2fs_i_gc_failures_write(inode, fi->i_gc_failures + 1); + return 0; } @@ -3234,7 +3264,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) goto done; } - if (f2fs_sb_has_blkzoned(sbi) && F2FS_HAS_BLOCKS(inode)) { + if (F2FS_HAS_BLOCKS(inode)) { ret = -EFBIG; goto out; } @@ -3261,7 +3291,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) } set_inode_flag(inode, FI_PIN_FILE); - ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]; + ret = F2FS_I(inode)->i_gc_failures; done: f2fs_update_time(sbi, REQ_TIME); out: @@ -3276,7 +3306,7 @@ static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg) __u32 pin = 0; if (is_inode_flag_set(inode, FI_PIN_FILE)) - pin = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]; + pin = F2FS_I(inode)->i_gc_failures; return put_user(pin, (u32 __user *)arg); } @@ -3522,9 +3552,6 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) if (!f2fs_sb_has_compression(sbi)) return -EOPNOTSUPP; - if (!f2fs_compressed_file(inode)) - return -EINVAL; - if (f2fs_readonly(sbi->sb)) return -EROFS; @@ -3543,7 +3570,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) goto out; } - if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + if (!f2fs_compressed_file(inode) || + is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { ret = -EINVAL; goto out; } @@ -3570,9 +3598,12 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) struct dnode_of_data dn; pgoff_t end_offset, count; + f2fs_lock_op(sbi); + set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE); if (ret) { + f2fs_unlock_op(sbi); if (ret == -ENOENT) { page_idx = f2fs_get_next_page_offset(&dn, page_idx); @@ -3590,6 +3621,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); + if (ret < 0) break; @@ -3600,6 +3633,8 @@ static int 
f2fs_release_compress_blocks(struct file *filp, unsigned long arg) filemap_invalidate_unlock(inode->i_mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); out: + if (released_blocks) + f2fs_update_time(sbi, REQ_TIME); inode_unlock(inode); mnt_drop_write_file(filp); @@ -3641,7 +3676,8 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count, while (count) { int compr_blocks = 0; - blkcnt_t reserved; + blkcnt_t reserved = 0; + blkcnt_t to_reserved; int ret; for (i = 0; i < cluster_size; i++) { @@ -3661,20 +3697,26 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count, * fails in release_compress_blocks(), so NEW_ADDR * is a possible case. */ - if (blkaddr == NEW_ADDR || - __is_valid_data_blkaddr(blkaddr)) { + if (blkaddr == NEW_ADDR) { + reserved++; + continue; + } + if (__is_valid_data_blkaddr(blkaddr)) { compr_blocks++; continue; } } - reserved = cluster_size - compr_blocks; + to_reserved = cluster_size - compr_blocks - reserved; /* for the case all blocks in cluster were reserved */ - if (reserved == 1) + if (to_reserved == 1) { + dn->ofs_in_node += cluster_size; goto next; + } - ret = inc_valid_block_count(sbi, dn->inode, &reserved, false); + ret = inc_valid_block_count(sbi, dn->inode, + &to_reserved, false); if (unlikely(ret)) return ret; @@ -3685,7 +3727,7 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count, f2fs_i_compr_blocks_update(dn->inode, compr_blocks, true); - *reserved_blocks += reserved; + *reserved_blocks += to_reserved; next: count -= cluster_size; } @@ -3704,9 +3746,6 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) if (!f2fs_sb_has_compression(sbi)) return -EOPNOTSUPP; - if (!f2fs_compressed_file(inode)) - return -EINVAL; - if (f2fs_readonly(sbi->sb)) return -EROFS; @@ -3718,7 +3757,8 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) inode_lock(inode); - if (!is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { 
+ if (!f2fs_compressed_file(inode) || + !is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { ret = -EINVAL; goto unlock_inode; } @@ -3735,9 +3775,12 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) struct dnode_of_data dn; pgoff_t end_offset, count; + f2fs_lock_op(sbi); + set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE); if (ret) { + f2fs_unlock_op(sbi); if (ret == -ENOENT) { page_idx = f2fs_get_next_page_offset(&dn, page_idx); @@ -3755,6 +3798,8 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); + if (ret < 0) break; @@ -3770,6 +3815,8 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) f2fs_mark_inode_dirty_sync(inode, true); } unlock_inode: + if (reserved_blocks) + f2fs_update_time(sbi, REQ_TIME); inode_unlock(inode); mnt_drop_write_file(filp); @@ -3778,7 +3825,7 @@ unlock_inode: } else if (reserved_blocks && atomic_read(&F2FS_I(inode)->i_compr_blocks)) { set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx " + f2fs_warn(sbi, "%s: partial blocks were reserved i_ino=%lx " "iblocks=%llu, reserved=%u, compr_blocks=%u, " "run fsck to fix.", __func__, inode->i_ino, inode->i_blocks, @@ -3966,6 +4013,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) if (len) ret = f2fs_secure_erase(prev_bdev, inode, prev_index, prev_block, len, range.flags); + f2fs_update_time(sbi, REQ_TIME); out: filemap_invalidate_unlock(mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -4119,9 +4167,6 @@ static int f2fs_ioc_decompress_file(struct file *filp) if (!(filp->f_mode & FMODE_WRITE)) return -EBADF; - if (!f2fs_compressed_file(inode)) - return -EINVAL; - f2fs_balance_fs(sbi, true); file_start_write(filp); @@ -4132,7 +4177,8 @@ static int f2fs_ioc_decompress_file(struct file *filp) goto out; } - if (is_inode_flag_set(inode, 
FI_COMPRESS_RELEASED)) { + if (!f2fs_compressed_file(inode) || + is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { ret = -EINVAL; goto out; } @@ -4175,6 +4221,7 @@ static int f2fs_ioc_decompress_file(struct file *filp) if (ret) f2fs_warn(sbi, "%s: The file might be partially decompressed (errno=%d). Please delete the file.", __func__, ret); + f2fs_update_time(sbi, REQ_TIME); out: inode_unlock(inode); file_end_write(filp); @@ -4197,9 +4244,6 @@ static int f2fs_ioc_compress_file(struct file *filp) if (!(filp->f_mode & FMODE_WRITE)) return -EBADF; - if (!f2fs_compressed_file(inode)) - return -EINVAL; - f2fs_balance_fs(sbi, true); file_start_write(filp); @@ -4210,7 +4254,8 @@ static int f2fs_ioc_compress_file(struct file *filp) goto out; } - if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + if (!f2fs_compressed_file(inode) || + is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { ret = -EINVAL; goto out; } @@ -4254,6 +4299,7 @@ static int f2fs_ioc_compress_file(struct file *filp) if (ret) f2fs_warn(sbi, "%s: The file might be partially compressed (errno=%d). 
Please delete the file.", __func__, ret); + f2fs_update_time(sbi, REQ_TIME); out: inode_unlock(inode); file_end_write(filp); @@ -4612,7 +4658,8 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter, map.m_may_create = true; if (dio) { - map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); + map.m_seg_type = f2fs_rw_hint_to_seg_type(sbi, + inode->i_write_hint); flag = F2FS_GET_BLOCK_PRE_DIO; } else { map.m_seg_type = NO_CHECK_TYPE; @@ -4660,8 +4707,21 @@ static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, return 0; } +static void f2fs_dio_write_submit_io(const struct iomap_iter *iter, + struct bio *bio, loff_t file_offset) +{ + struct inode *inode = iter->inode; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int seg_type = f2fs_rw_hint_to_seg_type(sbi, inode->i_write_hint); + enum temp_type temp = f2fs_get_segment_temp(seg_type); + + bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, DATA, temp); + submit_bio(bio); +} + static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = { - .end_io = f2fs_dio_write_end_io, + .end_io = f2fs_dio_write_end_io, + .submit_io = f2fs_dio_write_submit_io, }; static void f2fs_flush_buffered_write(struct address_space *mapping, @@ -4798,6 +4858,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) bool dio; bool may_need_sync = true; int preallocated; + const loff_t pos = iocb->ki_pos; + const ssize_t count = iov_iter_count(from); ssize_t ret; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { @@ -4819,6 +4881,12 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); } + if (f2fs_is_pinned_file(inode) && + !f2fs_overwrite_io(inode, pos, count)) { + ret = -EIO; + goto out_unlock; + } + ret = f2fs_write_checks(iocb, from); if (ret <= 0) goto out_unlock; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8852814dab7f..6066c6eecf41 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1434,7 +1434,7 @@ static 
int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; if (gc_type == BG_GC) { - if (PageWriteback(page)) { + if (folio_test_writeback(page_folio(page))) { err = -EAGAIN; goto out; } @@ -1554,10 +1554,15 @@ next_step: int err; inode = f2fs_iget(sb, dni.ino); - if (IS_ERR(inode) || is_bad_inode(inode) || - special_file(inode->i_mode)) + if (IS_ERR(inode)) continue; + if (is_bad_inode(inode) || + special_file(inode->i_mode)) { + iput(inode); + continue; + } + err = f2fs_gc_pinned_control(inode, gc_type, segno); if (err == -EAGAIN) { iput(inode); diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 9c0d06c4d19a..a8ea3301b815 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -26,6 +26,7 @@ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ #define DEF_GC_FAILED_PINNED_FILES 2048 +#define MAX_GC_FAILED_PINNED_FILES USHRT_MAX /* Search max. number of dirty segments to select a victim segment */ #define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index ac00423f117b..7638d0d7b7ee 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -61,22 +61,22 @@ bool f2fs_may_inline_dentry(struct inode *inode) return true; } -void f2fs_do_read_inline_data(struct page *page, struct page *ipage) +void f2fs_do_read_inline_data(struct folio *folio, struct page *ipage) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio_file_mapping(folio)->host; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return; - f2fs_bug_on(F2FS_P_SB(page), page->index); + f2fs_bug_on(F2FS_I_SB(inode), folio_index(folio)); - zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); + folio_zero_segment(folio, MAX_INLINE_DATA(inode), folio_size(folio)); /* Copy the whole inline data block */ - memcpy_to_page(page, 0, inline_data_addr(inode, ipage), + memcpy_to_folio(folio, 0, inline_data_addr(inode, ipage), MAX_INLINE_DATA(inode)); - if (!PageUptodate(page)) - SetPageUptodate(page); + if 
(!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); } void f2fs_truncate_inline_inode(struct inode *inode, @@ -97,13 +97,13 @@ void f2fs_truncate_inline_inode(struct inode *inode, clear_inode_flag(inode, FI_DATA_EXIST); } -int f2fs_read_inline_data(struct inode *inode, struct page *page) +int f2fs_read_inline_data(struct inode *inode, struct folio *folio) { struct page *ipage; ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) { - unlock_page(page); + folio_unlock(folio); return PTR_ERR(ipage); } @@ -112,15 +112,15 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) return -EAGAIN; } - if (page->index) - zero_user_segment(page, 0, PAGE_SIZE); + if (folio_index(folio)) + folio_zero_segment(folio, 0, folio_size(folio)); else - f2fs_do_read_inline_data(page, ipage); + f2fs_do_read_inline_data(folio, ipage); - if (!PageUptodate(page)) - SetPageUptodate(page); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); f2fs_put_page(ipage, 1); - unlock_page(page); + folio_unlock(folio); return 0; } @@ -164,9 +164,9 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) return -EFSCORRUPTED; } - f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page)); + f2fs_bug_on(F2FS_P_SB(page), folio_test_writeback(page_folio(page))); - f2fs_do_read_inline_data(page, dn->inode_page); + f2fs_do_read_inline_data(page_folio(page), dn->inode_page); set_page_dirty(page); /* clear dirty state */ diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index c26effdce9aa..005dde72aff3 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -161,7 +161,8 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) if (!f2fs_enable_inode_chksum(sbi, page)) #else if (!f2fs_enable_inode_chksum(sbi, page) || - PageDirty(page) || PageWriteback(page)) + PageDirty(page) || + folio_test_writeback(page_folio(page))) #endif return true; @@ -361,6 +362,12 @@ static bool sanity_check_inode(struct inode *inode, struct page 
*node_page) return false; } + if (fi->i_xattr_nid && f2fs_check_nid_range(sbi, fi->i_xattr_nid)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_xattr_nid: %u, run fsck to fix.", + __func__, inode->i_ino, fi->i_xattr_nid); + return false; + } + return true; } @@ -408,8 +415,7 @@ static int do_read_inode(struct inode *inode) if (S_ISDIR(inode->i_mode)) fi->i_current_depth = le32_to_cpu(ri->i_current_depth); else if (S_ISREG(inode->i_mode)) - fi->i_gc_failures[GC_FAILURE_PIN] = - le16_to_cpu(ri->i_gc_failures); + fi->i_gc_failures = le16_to_cpu(ri->i_gc_failures); fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); fi->i_flags = le32_to_cpu(ri->i_flags); if (S_ISREG(inode->i_mode)) @@ -679,8 +685,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth); else if (S_ISREG(inode->i_mode)) - ri->i_gc_failures = - cpu_to_le16(F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]); + ri->i_gc_failures = cpu_to_le16(F2FS_I(inode)->i_gc_failures); ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid); ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); @@ -804,6 +809,7 @@ void f2fs_evict_inode(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); nid_t xnid = fi->i_xattr_nid; int err = 0; + bool freeze_protected = false; f2fs_abort_atomic_write(inode, true); @@ -843,8 +849,10 @@ void f2fs_evict_inode(struct inode *inode) f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); - if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING)) + if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING)) { sb_start_intwrite(inode->i_sb); + freeze_protected = true; + } set_inode_flag(inode, FI_NO_ALLOC); i_size_write(inode, 0); retry: @@ -887,7 +895,7 @@ retry: if (dquot_initialize_needed(inode)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); } - if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING)) + if (freeze_protected) 
sb_end_intwrite(inode->i_sb); no_delete: dquot_drop(inode); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b3de6d6cdb02..b72ef96f7e33 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1187,7 +1187,17 @@ skip_partial: default: BUG(); } - if (err < 0 && err != -ENOENT) + if (err == -ENOENT) { + set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + f2fs_err_ratelimited(sbi, + "truncate node fail, ino:%lu, nid:%u, " + "offset[0]:%d, offset[1]:%d, nofs:%d", + inode->i_ino, dn.nid, offset[0], + offset[1], nofs); + err = 0; + } + if (err < 0) goto fail; if (offset[1] == 0 && ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { @@ -1319,6 +1329,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) } if (unlikely(new_ni.blk_addr != NULL_ADDR)) { err = -EFSCORRUPTED; + dec_valid_node_count(sbi, dn->inode, !ofs); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto fail; @@ -1345,7 +1356,6 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) if (ofs == 0) inc_valid_inode_count(sbi); return page; - fail: clear_node_page_dirty(page); f2fs_put_page(page, 1); @@ -1614,7 +1624,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, }; unsigned int seq; - trace_f2fs_writepage(page, NODE); + trace_f2fs_writepage(page_folio(page), NODE); if (unlikely(f2fs_cp_error(sbi))) { /* keep node pages in remount-ro mode */ @@ -1733,7 +1743,7 @@ int f2fs_move_node_page(struct page *node_page, int gc_type) goto release_page; } else { /* set page dirty and write it */ - if (!PageWriteback(node_page)) + if (!folio_test_writeback(page_folio(node_page))) set_page_dirty(node_page); } out_page: @@ -2161,7 +2171,7 @@ skip_write: static bool f2fs_dirty_node_folio(struct address_space *mapping, struct folio *folio) { - trace_f2fs_set_page_dirty(&folio->page, NODE); + trace_f2fs_set_page_dirty(folio, NODE); if (!folio_test_uptodate(folio)) 
folio_mark_uptodate(folio); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index e7bf15b8240a..496aee53c38a 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -330,8 +330,7 @@ static int recover_inode(struct inode *inode, struct page *page) F2FS_I(inode)->i_advise = raw->i_advise; F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags); f2fs_set_inode_flags(inode); - F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = - le16_to_cpu(raw->i_gc_failures); + F2FS_I(inode)->i_gc_failures = le16_to_cpu(raw->i_gc_failures); recover_inline_flags(inode, raw); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4fd76e867e0a..a0ce3d080f80 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -771,8 +771,10 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, block_t valid_blocks = get_valid_blocks(sbi, segno, true); - f2fs_bug_on(sbi, unlikely(!valid_blocks || - valid_blocks == CAP_BLKS_PER_SEC(sbi))); + f2fs_bug_on(sbi, + (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + !valid_blocks) || + valid_blocks == CAP_BLKS_PER_SEC(sbi)); if (!IS_CURSEC(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); @@ -1109,9 +1111,8 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, dc->error = 0; if (dc->error) - printk_ratelimited( - "%sF2FS-fs (%s): Issue discard(%u, %u, %u) failed, ret: %d", - KERN_INFO, sbi->sb->s_id, + f2fs_info_ratelimited(sbi, + "Issue discard(%u, %u, %u) failed, ret: %d", dc->di.lstart, dc->di.start, dc->di.len, dc->error); __detach_discard_cmd(dcc, dc); } @@ -2645,7 +2646,7 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, } static int is_next_segment_free(struct f2fs_sb_info *sbi, - struct curseg_info *curseg, int type) + struct curseg_info *curseg) { unsigned int segno = curseg->segno + 1; struct free_segmap_info *free_i = FREE_I(sbi); @@ -3073,8 +3074,7 @@ static bool need_new_seg(struct f2fs_sb_info *sbi, int type) if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && curseg->seg_type == 
CURSEG_WARM_NODE) return true; - if (curseg->alloc_type == LFS && - is_next_segment_free(sbi, curseg, type) && + if (curseg->alloc_type == LFS && is_next_segment_free(sbi, curseg) && likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0)) @@ -3352,8 +3352,14 @@ out: return err; } -int f2fs_rw_hint_to_seg_type(enum rw_hint hint) +int f2fs_rw_hint_to_seg_type(struct f2fs_sb_info *sbi, enum rw_hint hint) { + if (F2FS_OPTION(sbi).active_logs == 2) + return CURSEG_HOT_DATA; + else if (F2FS_OPTION(sbi).active_logs == 4) + return CURSEG_COLD_DATA; + + /* active_log == 6 */ switch (hint) { case WRITE_LIFE_SHORT: return CURSEG_HOT_DATA; @@ -3364,6 +3370,65 @@ int f2fs_rw_hint_to_seg_type(enum rw_hint hint) } } +/* + * This returns write hints for each segment type. This hints will be + * passed down to block layer as below by default. + * + * User F2FS Block + * ---- ---- ----- + * META WRITE_LIFE_NONE|REQ_META + * HOT_NODE WRITE_LIFE_NONE + * WARM_NODE WRITE_LIFE_MEDIUM + * COLD_NODE WRITE_LIFE_LONG + * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME + * extension list " " + * + * -- buffered io + * COLD_DATA WRITE_LIFE_EXTREME + * HOT_DATA WRITE_LIFE_SHORT + * WARM_DATA WRITE_LIFE_NOT_SET + * + * -- direct io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " WRITE_LIFE_NONE + * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM + * WRITE_LIFE_LONG " WRITE_LIFE_LONG + */ +enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp) +{ + switch (type) { + case DATA: + switch (temp) { + case WARM: + return WRITE_LIFE_NOT_SET; + case HOT: + return WRITE_LIFE_SHORT; + case COLD: + return WRITE_LIFE_EXTREME; + default: + return WRITE_LIFE_NONE; + } + case NODE: + switch (temp) { + case WARM: + return WRITE_LIFE_MEDIUM; + case HOT: + return WRITE_LIFE_NONE; + case 
COLD: + return WRITE_LIFE_LONG; + default: + return WRITE_LIFE_NONE; + } + case META: + return WRITE_LIFE_NONE; + default: + return WRITE_LIFE_NONE; + } +} + static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) @@ -3434,7 +3499,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) is_inode_flag_set(inode, FI_HOT_DATA) || f2fs_is_cow_file(inode)) return CURSEG_HOT_DATA; - return f2fs_rw_hint_to_seg_type(inode->i_write_hint); + return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), + inode->i_write_hint); } else { if (IS_DNODE(fio->page)) return is_cold_node(fio->page) ? CURSEG_WARM_NODE : @@ -3443,6 +3509,15 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) } } +int f2fs_get_segment_temp(int seg_type) +{ + if (IS_HOT(seg_type)) + return HOT; + else if (IS_WARM(seg_type)) + return WARM; + return COLD; +} + static int __get_segment_type(struct f2fs_io_info *fio) { int type = 0; @@ -3461,12 +3536,8 @@ static int __get_segment_type(struct f2fs_io_info *fio) f2fs_bug_on(fio->sbi, true); } - if (IS_HOT(type)) - fio->temp = HOT; - else if (IS_WARM(type)) - fio->temp = WARM; - else - fio->temp = COLD; + fio->temp = f2fs_get_segment_temp(type); + return type; } @@ -3559,6 +3630,8 @@ int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (segment_full) { if (type == CURSEG_COLD_DATA_PINNED && !((curseg->segno + 1) % sbi->segs_per_sec)) { + write_sum_page(sbi, curseg->sum_blk, + GET_SUM_BLOCK(sbi, curseg->segno)); reset_curseg_fields(curseg); goto skip_new_segment; } @@ -3612,13 +3685,13 @@ skip_new_segment: mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return 0; + out_err: *new_blkaddr = NULL_ADDR; up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return ret; - } void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, @@ -3660,8 +3733,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info 
*fio) &fio->new_blkaddr, sum, type, fio)) { if (fscrypt_inode_uses_fs_layer_crypto(fio->page->mapping->host)) fscrypt_finalize_bounce_page(&fio->encrypted_page); - if (PageWriteback(fio->page)) - end_page_writeback(fio->page); + end_page_writeback(fio->page); if (f2fs_in_warm_node_list(fio->sbi, fio->page)) f2fs_del_fsync_node_entry(fio->sbi, fio->page); goto out; @@ -3904,7 +3976,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered, bool locked) { - if (PageWriteback(page)) { + if (folio_test_writeback(page_folio(page))) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); /* submit cached LFS IO */ @@ -3913,7 +3985,8 @@ void f2fs_wait_on_page_writeback(struct page *page, f2fs_submit_merged_ipu_write(sbi, NULL, page); if (ordered) { wait_on_page_writeback(page); - f2fs_bug_on(sbi, locked && PageWriteback(page)); + f2fs_bug_on(sbi, locked && + folio_test_writeback(page_folio(page))); } else { wait_for_stable_page(page); } @@ -4959,17 +5032,6 @@ out: } #ifdef CONFIG_BLK_DEV_ZONED -static const char *f2fs_zone_status[BLK_ZONE_COND_OFFLINE + 1] = { - [BLK_ZONE_COND_NOT_WP] = "NOT_WP", - [BLK_ZONE_COND_EMPTY] = "EMPTY", - [BLK_ZONE_COND_IMP_OPEN] = "IMPLICIT_OPEN", - [BLK_ZONE_COND_EXP_OPEN] = "EXPLICIT_OPEN", - [BLK_ZONE_COND_CLOSED] = "CLOSED", - [BLK_ZONE_COND_READONLY] = "READONLY", - [BLK_ZONE_COND_FULL] = "FULL", - [BLK_ZONE_COND_OFFLINE] = "OFFLINE", -}; - static int check_zone_write_pointer(struct f2fs_sb_info *sbi, struct f2fs_dev_info *fdev, struct blk_zone *zone) @@ -5000,7 +5062,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) { f2fs_notice(sbi, "Open zones: valid block[0x%x,0x%x] cond[%s]", zone_segno, valid_block_cnt, - f2fs_zone_status[zone->cond]); + blk_zone_cond_str(zone->cond)); return 0; } @@ -5011,7 +5073,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, 
if (!valid_block_cnt) { f2fs_notice(sbi, "Zone without valid block has non-zero write " "pointer. Reset the write pointer: cond[%s]", - f2fs_zone_status[zone->cond]); + blk_zone_cond_str(zone->cond)); ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block, zone->len >> log_sectors_per_block); if (ret) @@ -5029,7 +5091,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, */ f2fs_notice(sbi, "Valid blocks are not aligned with write " "pointer: valid block[0x%x,0x%x] cond[%s]", - zone_segno, valid_block_cnt, f2fs_zone_status[zone->cond]); + zone_segno, valid_block_cnt, blk_zone_cond_str(zone->cond)); nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a4bc26dfdb1a..1f1b3647a998 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -66,21 +66,31 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_NO_SEGMENT] = "no free segment", }; -void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, - unsigned int type) +int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, + unsigned long type) { struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; if (rate) { + if (rate > INT_MAX) + return -EINVAL; atomic_set(&ffi->inject_ops, 0); - ffi->inject_rate = rate; + ffi->inject_rate = (int)rate; } - if (type) - ffi->inject_type = type; + if (type) { + if (type >= BIT(FAULT_MAX)) + return -EINVAL; + ffi->inject_type = (unsigned int)type; + } if (!rate && !type) memset(ffi, 0, sizeof(struct f2fs_fault_info)); + else + f2fs_info(sbi, + "build fault injection attr: rate: %lu, type: 0x%lx", + rate, type); + return 0; } #endif @@ -886,14 +896,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_fault_injection: if (args->from && match_int(args, &arg)) return -EINVAL; - f2fs_build_fault_attr(sbi, arg, F2FS_ALL_FAULT_TYPE); + if (f2fs_build_fault_attr(sbi, arg, + F2FS_ALL_FAULT_TYPE)) + return -EINVAL; 
set_opt(sbi, FAULT_INJECTION); break; case Opt_fault_type: if (args->from && match_int(args, &arg)) return -EINVAL; - f2fs_build_fault_attr(sbi, 0, arg); + if (f2fs_build_fault_attr(sbi, 0, arg)) + return -EINVAL; set_opt(sbi, FAULT_INJECTION); break; #else @@ -2132,8 +2145,6 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount) F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL; F2FS_OPTION(sbi).errors = MOUNT_ERRORS_CONTINUE; - sbi->sb->s_flags &= ~SB_INLINECRYPT; - set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); @@ -2326,6 +2337,17 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (err) goto restore_opts; +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi) && + sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) { + f2fs_err(sbi, + "zoned: max open zones %u is too small, need at least %u open zones", + sbi->max_open_zones, F2FS_OPTION(sbi).active_logs); + err = -EINVAL; + goto restore_opts; + } +#endif + /* flush outstanding errors before changing fs state */ flush_work(&sbi->s_error_work); @@ -2547,6 +2569,11 @@ restore_opts: return err; } +static void f2fs_shutdown(struct super_block *sb) +{ + f2fs_do_shutdown(F2FS_SB(sb), F2FS_GOING_DOWN_NOSYNC, false); +} + #ifdef CONFIG_QUOTA static bool f2fs_need_recovery(struct f2fs_sb_info *sbi) { @@ -3146,6 +3173,7 @@ static const struct super_operations f2fs_sops = { .unfreeze_fs = f2fs_unfreeze, .statfs = f2fs_statfs, .remount_fs = f2fs_remount, + .shutdown = f2fs_shutdown, }; #ifdef CONFIG_FS_ENCRYPTION @@ -3441,7 +3469,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, } } - /* Currently, support only 4KB block size */ + /* only support block_size equals to PAGE_SIZE */ if (le32_to_cpu(raw_super->log_blocksize) != F2FS_BLKSIZE_BITS) { f2fs_info(sbi, "Invalid log_blocksize (%u), supports only %u", le32_to_cpu(raw_super->log_blocksize), @@ -3862,11 +3890,24 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int 
devi) sector_t nr_sectors = bdev_nr_sectors(bdev); struct f2fs_report_zones_args rep_zone_arg; u64 zone_sectors; + unsigned int max_open_zones; int ret; if (!f2fs_sb_has_blkzoned(sbi)) return 0; + if (bdev_is_zoned(FDEV(devi).bdev)) { + max_open_zones = bdev_max_open_zones(bdev); + if (max_open_zones && (max_open_zones < sbi->max_open_zones)) + sbi->max_open_zones = max_open_zones; + if (sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) { + f2fs_err(sbi, + "zoned: max open zones %u is too small, need at least %u open zones", + sbi->max_open_zones, F2FS_OPTION(sbi).active_logs); + return -EINVAL; + } + } + zone_sectors = bdev_zone_sectors(bdev); if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != SECTOR_TO_BLOCK(zone_sectors)) @@ -4131,9 +4172,15 @@ void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason, if (shutdown) set_sbi_flag(sbi, SBI_IS_SHUTDOWN); - /* continue filesystem operators if errors=continue */ - if (continue_fs || f2fs_readonly(sb)) + /* + * Continue filesystem operators if errors=continue. Should not set + * RO by shutdown, since RO bypasses thaw_super which can hang the + * system. 
+ */ + if (continue_fs || f2fs_readonly(sb) || shutdown) { + f2fs_warn(sbi, "Stopped filesystem due to reason: %d", reason); return; + } f2fs_warn(sbi, "Remounting filesystem read-only"); /* @@ -4180,6 +4227,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev); sbi->aligned_blksize = true; +#ifdef CONFIG_BLK_DEV_ZONED + sbi->max_open_zones = UINT_MAX; +#endif for (i = 0; i < max_devices; i++) { if (i == 0) @@ -4894,12 +4944,6 @@ static int __init init_f2fs_fs(void) { int err; - if (PAGE_SIZE != F2FS_BLKSIZE) { - printk("F2FS not supported on PAGE_SIZE(%lu) != BLOCK_SIZE(%lu)\n", - PAGE_SIZE, F2FS_BLKSIZE); - return -EINVAL; - } - err = init_inodecache(); if (err) goto fail; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index a568ce96cf56..09d3ecfaa4f1 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -484,10 +484,16 @@ out: if (ret < 0) return ret; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (a->struct_type == FAULT_INFO_TYPE && t >= BIT(FAULT_MAX)) - return -EINVAL; - if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX) - return -EINVAL; + if (a->struct_type == FAULT_INFO_TYPE) { + if (f2fs_build_fault_attr(sbi, 0, t)) + return -EINVAL; + return count; + } + if (a->struct_type == FAULT_INFO_RATE) { + if (f2fs_build_fault_attr(sbi, t, 0)) + return -EINVAL; + return count; + } #endif if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); @@ -675,6 +681,13 @@ out: return count; } + if (!strcmp(a->attr.name, "gc_pin_file_threshold")) { + if (t > MAX_GC_FAILED_PINNED_FILES) + return -EINVAL; + sbi->gc_pin_file_threshold = t; + return count; + } + if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { if (t != 0) return -EINVAL; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 00235b8a1823..acbec5bdd521 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -269,6 +269,18 @@ enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, }; /** * fat_parse_long - Parse extended directory entry. 
* + * @dir: Pointer to the inode that represents the directory. + * @pos: On input, contains the starting position to read from. + * On output, updated with the new position. + * @bh: Pointer to the buffer head that may be used for reading directory + * entries. May be updated. + * @de: On input, points to the current directory entry. + * On output, points to the next directory entry. + * @unicode: Pointer to a buffer where the parsed Unicode long filename will be + * stored. + * @nr_slots: Pointer to a variable that will store the number of longname + * slots found. + * * This function returns zero on success, negative value on error, or one of * the following: * diff --git a/fs/file.c b/fs/file.c index 3b683b9101d8..8076aef9c210 100644 --- a/fs/file.c +++ b/fs/file.c @@ -271,6 +271,11 @@ static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); } +static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) +{ + return test_bit(fd, fdt->open_fds); +} + static unsigned int count_open_files(struct fdtable *fdt) { unsigned int size = fdt->max_fds; @@ -915,13 +920,8 @@ struct file *get_file_rcu(struct file __rcu **f) struct file __rcu *file; file = __get_file_rcu(f); - if (unlikely(!file)) - return NULL; - - if (unlikely(IS_ERR(file))) - continue; - - return file; + if (!IS_ERR(file)) + return file; } } EXPORT_SYMBOL_GPL(get_file_rcu); @@ -1219,12 +1219,9 @@ void set_close_on_exec(unsigned int fd, int flag) bool get_close_on_exec(unsigned int fd) { - struct files_struct *files = current->files; - struct fdtable *fdt; bool res; rcu_read_lock(); - fdt = files_fdtable(files); - res = close_on_exec(fd, fdt); + res = close_on_exec(fd, current->files); rcu_read_unlock(); return res; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 3ec8bb5e68ff..9eb191b5c4de 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1813,7 +1813,8 @@ static void fuse_resend(struct fuse_conn *fc) 
spin_unlock(&fc->lock); list_for_each_entry_safe(req, next, &to_queue, list) { - __set_bit(FR_PENDING, &req->flags); + set_bit(FR_PENDING, &req->flags); + clear_bit(FR_SENT, &req->flags); /* mark the request as resend request */ req->in.h.unique |= FUSE_UNIQUE_RESEND; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b57ce4157640..f39456c65ed7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -935,14 +935,10 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, } for (i = 0; i < ap->num_pages; i++) { - struct page *page = ap->pages[i]; + struct folio *folio = page_folio(ap->pages[i]); - if (!err) - SetPageUptodate(page); - else - SetPageError(page); - unlock_page(page); - put_page(page); + folio_end_read(folio, !err); + folio_put(folio); } if (ia->ff) fuse_file_put(ia->ff, false); diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 726640fa439e..572ce8a82ceb 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -8,6 +8,7 @@ #include <linux/uio.h> #include <linux/compat.h> #include <linux/fileattr.h> +#include <linux/fsverity.h> static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args, struct fuse_ioctl_out *outarg) @@ -117,6 +118,53 @@ static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, return 0; } +/* For fs-verity, determine iov lengths from input */ +static int fuse_setup_measure_verity(unsigned long arg, struct iovec *iov) +{ + __u16 digest_size; + struct fsverity_digest __user *uarg = (void __user *)arg; + + if (copy_from_user(&digest_size, &uarg->digest_size, sizeof(digest_size))) + return -EFAULT; + + if (digest_size > SIZE_MAX - sizeof(struct fsverity_digest)) + return -EINVAL; + + iov->iov_len = sizeof(struct fsverity_digest) + digest_size; + + return 0; +} + +static int fuse_setup_enable_verity(unsigned long arg, struct iovec *iov, + unsigned int *in_iovs) +{ + struct fsverity_enable_arg enable; + struct fsverity_enable_arg __user *uarg = (void __user *)arg; + const __u32 
max_buffer_len = FUSE_MAX_MAX_PAGES * PAGE_SIZE; + + if (copy_from_user(&enable, uarg, sizeof(enable))) + return -EFAULT; + + if (enable.salt_size > max_buffer_len || enable.sig_size > max_buffer_len) + return -ENOMEM; + + if (enable.salt_size > 0) { + iov++; + (*in_iovs)++; + + iov->iov_base = u64_to_user_ptr(enable.salt_ptr); + iov->iov_len = enable.salt_size; + } + + if (enable.sig_size > 0) { + iov++; + (*in_iovs)++; + + iov->iov_base = u64_to_user_ptr(enable.sig_ptr); + iov->iov_len = enable.sig_size; + } + return 0; +} /* * For ioctls, there is no generic way to determine how much memory @@ -227,6 +275,18 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, out_iov = iov; out_iovs = 1; } + + err = 0; + switch (cmd) { + case FS_IOC_MEASURE_VERITY: + err = fuse_setup_measure_verity(arg, iov); + break; + case FS_IOC_ENABLE_VERITY: + err = fuse_setup_enable_verity(arg, iov, &in_iovs); + break; + } + if (err) + goto out; } retry: diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index bb3e941b9503..1a52a51b6b07 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -7,6 +7,8 @@ #include <linux/fs.h> #include <linux/dax.h> #include <linux/pci.h> +#include <linux/interrupt.h> +#include <linux/group_cpus.h> #include <linux/pfn_t.h> #include <linux/memremap.h> #include <linux/module.h> @@ -67,6 +69,8 @@ struct virtio_fs { unsigned int num_request_queues; /* number of request queues */ struct dax_device *dax_dev; + unsigned int *mq_map; /* index = cpu id, value = request vq id */ + /* DAX memory window where file contents are mapped */ void *window_kaddr; phys_addr_t window_phys_addr; @@ -185,6 +189,7 @@ static void virtio_fs_ktype_release(struct kobject *kobj) { struct virtio_fs *vfs = container_of(kobj, struct virtio_fs, kobj); + kfree(vfs->mq_map); kfree(vfs->vqs); kfree(vfs); } @@ -706,6 +711,44 @@ static void virtio_fs_requests_done_work(struct work_struct *work) } } +static void virtio_fs_map_queues(struct virtio_device *vdev, 
struct virtio_fs *fs) +{ + const struct cpumask *mask, *masks; + unsigned int q, cpu; + + /* First attempt to map using existing transport layer affinities + * e.g. PCIe MSI-X + */ + if (!vdev->config->get_vq_affinity) + goto fallback; + + for (q = 0; q < fs->num_request_queues; q++) { + mask = vdev->config->get_vq_affinity(vdev, VQ_REQUEST + q); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + fs->mq_map[cpu] = q; + } + + return; +fallback: + /* Attempt to map evenly in groups over the CPUs */ + masks = group_cpus_evenly(fs->num_request_queues); + /* If even this fails we default to all CPUs use queue zero */ + if (!masks) { + for_each_possible_cpu(cpu) + fs->mq_map[cpu] = 0; + return; + } + + for (q = 0; q < fs->num_request_queues; q++) { + for_each_cpu(cpu, &masks[q]) + fs->mq_map[cpu] = q; + } + kfree(masks); +} + /* Virtqueue interrupt handler */ static void virtio_fs_vq_done(struct virtqueue *vq) { @@ -742,6 +785,11 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, { struct virtqueue **vqs; vq_callback_t **callbacks; + /* Specify pre_vectors to ensure that the queues before the + * request queues (e.g. 
hiprio) don't claim any of the CPUs in + * the multi-queue mapping and interrupt affinities + */ + struct irq_affinity desc = { .pre_vectors = VQ_REQUEST }; const char **names; unsigned int i; int ret = 0; @@ -751,6 +799,9 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, if (fs->num_request_queues == 0) return -EINVAL; + /* Truncate nr of request queues to nr_cpu_id */ + fs->num_request_queues = min_t(unsigned int, fs->num_request_queues, + nr_cpu_ids); fs->nvqs = VQ_REQUEST + fs->num_request_queues; fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL); if (!fs->vqs) @@ -760,7 +811,9 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]), GFP_KERNEL); names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL); - if (!vqs || !callbacks || !names) { + fs->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*fs->mq_map), GFP_KERNEL, + dev_to_node(&vdev->dev)); + if (!vqs || !callbacks || !names || !fs->mq_map) { ret = -ENOMEM; goto out; } @@ -780,7 +833,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, names[i] = fs->vqs[i].name; } - ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL); + ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, &desc); if (ret < 0) goto out; @@ -792,8 +845,10 @@ out: kfree(names); kfree(callbacks); kfree(vqs); - if (ret) + if (ret) { kfree(fs->vqs); + kfree(fs->mq_map); + } return ret; } @@ -939,7 +994,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) if (ret < 0) goto out; - /* TODO vq affinity */ + virtio_fs_map_queues(vdev, fs); ret = virtio_fs_setup_dax(vdev, fs); if (ret < 0) @@ -1023,7 +1078,6 @@ static const unsigned int feature_table[] = {}; static struct virtio_driver virtio_fs_driver = { .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, .id_table = id_table, .feature_table = feature_table, .feature_table_size = ARRAY_SIZE(feature_table), @@ -1288,7 +1342,7 @@ out: static 
void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq) __releases(fiq->lock) { - unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */ + unsigned int queue_id; struct virtio_fs *fs; struct fuse_req *req; struct virtio_fs_vq *fsvq; @@ -1302,11 +1356,13 @@ __releases(fiq->lock) spin_unlock(&fiq->lock); fs = fiq->priv; + queue_id = VQ_REQUEST + fs->mq_map[raw_smp_processor_id()]; - pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n", - __func__, req->in.h.opcode, req->in.h.unique, + pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u queue_id %u\n", + __func__, req->in.h.opcode, req->in.h.unique, req->in.h.nodeid, req->in.h.len, - fuse_len_args(req->args->out_numargs, req->args->out_args)); + fuse_len_args(req->args->out_numargs, req->args->out_args), + queue_id); fsvq = &fs->vqs[queue_id]; ret = virtio_fs_enqueue_req(fsvq, req, false); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9f11fc1e79eb..4ea6c8bfb4e6 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1267,7 +1267,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping = gfs2_glock2aspace(gl); if (mapping) { mapping->a_ops = &gfs2_meta_aops; - mapping->host = s->s_bdev->bd_inode; + mapping->host = s->s_bdev->bd_mapping->host; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->i_private_data = NULL; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 227edbaddfbc..05975ec76d35 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -114,7 +114,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) address_space_init_once(mapping); mapping->a_ops = &gfs2_rgrp_aops; - mapping->host = sb->s_bdev->bd_inode; + mapping->host = sb->s_bdev->bd_mapping->host; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->i_private_data = NULL; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 34ac73cc36b1..412f295acebe 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ 
-176,14 +176,12 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info; + struct vm_unmapped_area_info info = {}; - info.flags = 0; info.length = len; info.low_limit = current->mm->mmap_base; info.high_limit = arch_get_mmap_end(addr, len, flags); info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; return vm_unmapped_area(&info); } @@ -192,14 +190,13 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info; + struct vm_unmapped_area_info info = {}; info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; addr = vm_unmapped_area(&info); /* @@ -249,11 +246,11 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, } /* - * Use mm->get_unmapped_area value as a hint to use topdown routine. + * Use MMF_TOPDOWN flag as a hint to use topdown routine. * If architectures have special needs, they should define their own * version of hugetlb_get_unmapped_area. 
*/ - if (mm->get_unmapped_area == arch_get_unmapped_area_topdown) + if (test_bit(MMF_TOPDOWN, &mm->flags)) return hugetlb_get_unmapped_area_topdown(file, addr, len, pgoff, flags); return hugetlb_get_unmapped_area_bottomup(file, addr, len, diff --git a/fs/internal.h b/fs/internal.h index 7ca738904e34..ab2225136f60 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -62,6 +62,9 @@ int do_mkdirat(int dfd, struct filename *name, umode_t mode); int do_symlinkat(struct filename *from, int newdfd, struct filename *to); int do_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags); +int vfs_tmpfile(struct mnt_idmap *idmap, + const struct path *parentpath, + struct file *file, umode_t mode); /* * namespace.c diff --git a/fs/ioctl.c b/fs/ioctl.c index fb0628e680c4..64776891120c 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -796,6 +796,9 @@ static int ioctl_get_fs_sysfs_path(struct file *file, void __user *argp) * * When you add any new common ioctls to the switches above and below, * please ensure they have compatible arguments in compat mode. + * + * The LSM mailing list should also be notified of any command additions or + * changes, as specific LSMs may be affected. */ static int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, unsigned long arg) diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index fc070184b7fa..381d76c5c232 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -4,7 +4,7 @@ # All Rights Reserved. 
# -ccflags-y += -I $(srctree)/$(src) # needed for trace events +ccflags-y += -I $(src) # needed for trace events obj-$(CONFIG_FS_IOMAP) += iomap.o diff --git a/fs/isofs/Makefile b/fs/isofs/Makefile index 6498fd2b0f60..b25bc542a22b 100644 --- a/fs/isofs/Makefile +++ b/fs/isofs/Makefile @@ -5,7 +5,6 @@ obj-$(CONFIG_ISO9660_FS) += isofs.o -isofs-objs-y := namei.o inode.o dir.o util.o rock.o export.o -isofs-objs-$(CONFIG_JOLIET) += joliet.o -isofs-objs-$(CONFIG_ZISOFS) += compress.o -isofs-objs := $(isofs-objs-y) +isofs-y := namei.o inode.o dir.o util.o rock.o export.o +isofs-$(CONFIG_JOLIET) += joliet.o +isofs-$(CONFIG_ZISOFS) += compress.o diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index c4da3f634b92..34d5baa5d88a 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -346,8 +346,6 @@ static int zisofs_read_folio(struct file *file, struct folio *folio) for (i = 0; i < pcount; i++, index++) { if (i != full_page) pages[i] = grab_cache_page_nowait(mapping, index); - if (pages[i]) - ClearPageError(pages[i]); } err = zisofs_fill_pages(inode, full_page, pcount, pages); @@ -356,8 +354,6 @@ static int zisofs_read_folio(struct file *file, struct folio *folio) for (i = 0; i < pcount; i++) { if (pages[i]) { flush_dcache_page(pages[i]); - if (i == full_page && err) - SetPageError(pages[i]); unlock_page(pages[i]); if (i != full_page) put_page(pages[i]); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 2a616a9f289d..93b1077a380a 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -21,11 +21,12 @@ #include <linux/ctype.h> #include <linux/statfs.h> #include <linux/cdrom.h> -#include <linux/parser.h> #include <linux/mpage.h> #include <linux/user_namespace.h> #include <linux/seq_file.h> #include <linux/blkdev.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include "isofs.h" #include "zisofs.h" @@ -110,10 +111,10 @@ static void destroy_inodecache(void) kmem_cache_destroy(isofs_inode_cachep); } -static int isofs_remount(struct super_block 
*sb, int *flags, char *data) +static int isofs_reconfigure(struct fs_context *fc) { - sync_filesystem(sb); - if (!(*flags & SB_RDONLY)) + sync_filesystem(fc->root->d_sb); + if (!(fc->sb_flags & SB_RDONLY)) return -EROFS; return 0; } @@ -123,7 +124,6 @@ static const struct super_operations isofs_sops = { .free_inode = isofs_free_inode, .put_super = isofs_put_super, .statfs = isofs_statfs, - .remount_fs = isofs_remount, .show_options = isofs_show_options, }; @@ -145,7 +145,7 @@ static const struct dentry_operations isofs_dentry_ops[] = { #endif }; -struct iso9660_options{ +struct isofs_options{ unsigned int rock:1; unsigned int joliet:1; unsigned int cruft:1; @@ -289,197 +289,161 @@ isofs_dentry_cmpi_ms(const struct dentry *dentry, #endif enum { - Opt_block, Opt_check_r, Opt_check_s, Opt_cruft, Opt_gid, Opt_ignore, - Opt_iocharset, Opt_map_a, Opt_map_n, Opt_map_o, Opt_mode, Opt_nojoliet, - Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err, - Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, Opt_overriderockperm, + Opt_block, Opt_check, Opt_cruft, Opt_gid, Opt_ignore, Opt_iocharset, + Opt_map, Opt_mode, Opt_nojoliet, Opt_norock, Opt_sb, Opt_session, + Opt_uid, Opt_unhide, Opt_utf8, Opt_err, Opt_nocompress, Opt_hide, + Opt_showassoc, Opt_dmode, Opt_overriderockperm, }; -static const match_table_t tokens = { - {Opt_norock, "norock"}, - {Opt_nojoliet, "nojoliet"}, - {Opt_unhide, "unhide"}, - {Opt_hide, "hide"}, - {Opt_showassoc, "showassoc"}, - {Opt_cruft, "cruft"}, - {Opt_utf8, "utf8"}, - {Opt_iocharset, "iocharset=%s"}, - {Opt_map_a, "map=acorn"}, - {Opt_map_a, "map=a"}, - {Opt_map_n, "map=normal"}, - {Opt_map_n, "map=n"}, - {Opt_map_o, "map=off"}, - {Opt_map_o, "map=o"}, - {Opt_session, "session=%u"}, - {Opt_sb, "sbsector=%u"}, - {Opt_check_r, "check=relaxed"}, - {Opt_check_r, "check=r"}, - {Opt_check_s, "check=strict"}, - {Opt_check_s, "check=s"}, - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_mode, "mode=%u"}, - {Opt_dmode, "dmode=%u"}, 
- {Opt_overriderockperm, "overriderockperm"}, - {Opt_block, "block=%u"}, - {Opt_ignore, "conv=binary"}, - {Opt_ignore, "conv=b"}, - {Opt_ignore, "conv=text"}, - {Opt_ignore, "conv=t"}, - {Opt_ignore, "conv=mtext"}, - {Opt_ignore, "conv=m"}, - {Opt_ignore, "conv=auto"}, - {Opt_ignore, "conv=a"}, - {Opt_nocompress, "nocompress"}, - {Opt_err, NULL} +static const struct constant_table isofs_param_map[] = { + {"acorn", 'a'}, + {"a", 'a'}, + {"normal", 'n'}, + {"n", 'n'}, + {"off", 'o'}, + {"o", 'o'}, + {} }; -static int parse_options(char *options, struct iso9660_options *popt) -{ - char *p; - int option; - unsigned int uv; - - popt->map = 'n'; - popt->rock = 1; - popt->joliet = 1; - popt->cruft = 0; - popt->hide = 0; - popt->showassoc = 0; - popt->check = 'u'; /* unset */ - popt->nocompress = 0; - popt->blocksize = 1024; - popt->fmode = popt->dmode = ISOFS_INVALID_MODE; - popt->uid_set = 0; - popt->gid_set = 0; - popt->gid = GLOBAL_ROOT_GID; - popt->uid = GLOBAL_ROOT_UID; - popt->iocharset = NULL; - popt->overriderockperm = 0; - popt->session=-1; - popt->sbsector=-1; - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - substring_t args[MAX_OPT_ARGS]; - unsigned n; - - if (!*p) - continue; +static const struct constant_table isofs_param_check[] = { + {"relaxed", 'r'}, + {"r", 'r'}, + {"strict", 's'}, + {"s", 's'}, + {} +}; - token = match_token(p, tokens, args); - switch (token) { - case Opt_norock: - popt->rock = 0; - break; - case Opt_nojoliet: - popt->joliet = 0; - break; - case Opt_hide: - popt->hide = 1; - break; - case Opt_unhide: - case Opt_showassoc: - popt->showassoc = 1; - break; - case Opt_cruft: - popt->cruft = 1; - break; +static const struct fs_parameter_spec isofs_param_spec[] = { + fsparam_flag ("norock", Opt_norock), + fsparam_flag ("nojoliet", Opt_nojoliet), + fsparam_flag ("unhide", Opt_unhide), + fsparam_flag ("hide", Opt_hide), + fsparam_flag ("showassoc", Opt_showassoc), + fsparam_flag ("cruft", Opt_cruft), + 
fsparam_flag ("utf8", Opt_utf8), + fsparam_string ("iocharset", Opt_iocharset), + fsparam_enum ("map", Opt_map, isofs_param_map), + fsparam_u32 ("session", Opt_session), + fsparam_u32 ("sbsector", Opt_sb), + fsparam_enum ("check", Opt_check, isofs_param_check), + fsparam_u32 ("uid", Opt_uid), + fsparam_u32 ("gid", Opt_gid), + /* Note: mode/dmode historically accepted %u not strictly %o */ + fsparam_u32 ("mode", Opt_mode), + fsparam_u32 ("dmode", Opt_dmode), + fsparam_flag ("overriderockperm", Opt_overriderockperm), + fsparam_u32 ("block", Opt_block), + fsparam_string ("conv", Opt_ignore), + fsparam_flag ("nocompress", Opt_nocompress), + {} +}; + +static int isofs_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct isofs_options *popt = fc->fs_private; + struct fs_parse_result result; + int opt; + kuid_t uid; + kgid_t gid; + unsigned int n; + + /* There are no remountable options */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) + return 0; + + opt = fs_parse(fc, isofs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_norock: + popt->rock = 0; + break; + case Opt_nojoliet: + popt->joliet = 0; + break; + case Opt_hide: + popt->hide = 1; + break; + case Opt_unhide: + case Opt_showassoc: + popt->showassoc = 1; + break; + case Opt_cruft: + popt->cruft = 1; + break; #ifdef CONFIG_JOLIET - case Opt_utf8: - kfree(popt->iocharset); - popt->iocharset = kstrdup("utf8", GFP_KERNEL); - if (!popt->iocharset) - return 0; - break; - case Opt_iocharset: - kfree(popt->iocharset); - popt->iocharset = match_strdup(&args[0]); - if (!popt->iocharset) - return 0; - break; + case Opt_utf8: + kfree(popt->iocharset); + popt->iocharset = kstrdup("utf8", GFP_KERNEL); + if (!popt->iocharset) + return -ENOMEM; + break; + case Opt_iocharset: + kfree(popt->iocharset); + popt->iocharset = kstrdup(param->string, GFP_KERNEL); + if (!popt->iocharset) + return -ENOMEM; + break; #endif - case Opt_map_a: - popt->map = 'a'; - break; - case 
Opt_map_o: - popt->map = 'o'; - break; - case Opt_map_n: - popt->map = 'n'; - break; - case Opt_session: - if (match_int(&args[0], &option)) - return 0; - n = option; - /* - * Track numbers are supposed to be in range 1-99, the - * mount option starts indexing at 0. - */ - if (n >= 99) - return 0; - popt->session = n + 1; - break; - case Opt_sb: - if (match_int(&args[0], &option)) - return 0; - popt->sbsector = option; - break; - case Opt_check_r: - popt->check = 'r'; - break; - case Opt_check_s: - popt->check = 's'; - break; - case Opt_ignore: - break; - case Opt_uid: - if (match_uint(&args[0], &uv)) - return 0; - popt->uid = make_kuid(current_user_ns(), uv); - if (!uid_valid(popt->uid)) - return 0; - popt->uid_set = 1; - break; - case Opt_gid: - if (match_uint(&args[0], &uv)) - return 0; - popt->gid = make_kgid(current_user_ns(), uv); - if (!gid_valid(popt->gid)) - return 0; - popt->gid_set = 1; - break; - case Opt_mode: - if (match_int(&args[0], &option)) - return 0; - popt->fmode = option; - break; - case Opt_dmode: - if (match_int(&args[0], &option)) - return 0; - popt->dmode = option; - break; - case Opt_overriderockperm: - popt->overriderockperm = 1; - break; - case Opt_block: - if (match_int(&args[0], &option)) - return 0; - n = option; - if (n != 512 && n != 1024 && n != 2048) - return 0; - popt->blocksize = n; - break; - case Opt_nocompress: - popt->nocompress = 1; - break; - default: - return 0; - } + case Opt_map: + popt->map = result.uint_32; + break; + case Opt_session: + n = result.uint_32; + /* + * Track numbers are supposed to be in range 1-99, the + * mount option starts indexing at 0. 
+ */ + if (n >= 99) + return -EINVAL; + popt->session = n + 1; + break; + case Opt_sb: + popt->sbsector = result.uint_32; + break; + case Opt_check: + popt->check = result.uint_32; + break; + case Opt_ignore: + break; + case Opt_uid: + uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(uid)) + return -EINVAL; + popt->uid = uid; + popt->uid_set = 1; + break; + case Opt_gid: + gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(gid)) + return -EINVAL; + popt->gid = gid; + popt->gid_set = 1; + break; + case Opt_mode: + popt->fmode = result.uint_32; + break; + case Opt_dmode: + popt->dmode = result.uint_32; + break; + case Opt_overriderockperm: + popt->overriderockperm = 1; + break; + case Opt_block: + n = result.uint_32; + if (n != 512 && n != 1024 && n != 2048) + return -EINVAL; + popt->blocksize = n; + break; + case Opt_nocompress: + popt->nocompress = 1; + break; + default: + return -EINVAL; } - return 1; + return 0; } /* @@ -615,7 +579,7 @@ static bool rootdir_empty(struct super_block *sb, unsigned long block) /* * Initialize the superblock and read the root inode. 
*/ -static int isofs_fill_super(struct super_block *s, void *data, int silent) +static int isofs_fill_super(struct super_block *s, struct fs_context *fc) { struct buffer_head *bh = NULL, *pri_bh = NULL; struct hs_primary_descriptor *h_pri = NULL; @@ -623,7 +587,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) struct iso_supplementary_descriptor *sec = NULL; struct iso_directory_record *rootp; struct inode *inode; - struct iso9660_options opt; + struct isofs_options *opt = fc->fs_private; struct isofs_sb_info *sbi; unsigned long first_data_zone; int joliet_level = 0; @@ -631,15 +595,13 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) int orig_zonesize; int table, error = -EINVAL; unsigned int vol_desc_start; + int silent = fc->sb_flags & SB_SILENT; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; s->s_fs_info = sbi; - if (!parse_options((char *)data, &opt)) - goto out_freesbi; - /* * First of all, get the hardware blocksize for this device. * If we don't know what it is, or the hardware blocksize is @@ -655,14 +617,14 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) bdev_logical_block_size(s->s_bdev)); goto out_freesbi; } - opt.blocksize = sb_min_blocksize(s, opt.blocksize); + opt->blocksize = sb_min_blocksize(s, opt->blocksize); sbi->s_high_sierra = 0; /* default is iso9660 */ - sbi->s_session = opt.session; - sbi->s_sbsector = opt.sbsector; + sbi->s_session = opt->session; + sbi->s_sbsector = opt->sbsector; - vol_desc_start = (opt.sbsector != -1) ? - opt.sbsector : isofs_get_last_session(s,opt.session); + vol_desc_start = (opt->sbsector != -1) ? 
+ opt->sbsector : isofs_get_last_session(s, opt->session); for (iso_blknum = vol_desc_start+16; iso_blknum < vol_desc_start+100; iso_blknum++) { @@ -696,7 +658,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) { sec = (struct iso_supplementary_descriptor *)vdp; if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) { - if (opt.joliet) { + if (opt->joliet) { if (sec->escape[2] == 0x40) joliet_level = 1; else if (sec->escape[2] == 0x43) @@ -721,7 +683,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) goto out_freebh; sbi->s_high_sierra = 1; - opt.rock = 0; + opt->rock = 0; h_pri = (struct hs_primary_descriptor *)vdp; goto root_found; } @@ -749,7 +711,7 @@ root_found: goto out_freebh; } - if (joliet_level && (!pri || !opt.rock)) { + if (joliet_level && (!pri || !opt->rock)) { /* This is the case of Joliet with the norock mount flag. * A disc with both Joliet and Rock Ridge is handled later */ @@ -780,7 +742,7 @@ root_found: * blocks that were 512 bytes (which should only very rarely * happen.) */ - if (orig_zonesize < opt.blocksize) + if (orig_zonesize < opt->blocksize) goto out_bad_size; /* RDE: convert log zone size to bit shift */ @@ -865,10 +827,10 @@ root_found: #ifdef CONFIG_JOLIET if (joliet_level) { - char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT; + char *p = opt->iocharset ? opt->iocharset : CONFIG_NLS_DEFAULT; if (strcmp(p, "utf8") != 0) { - sbi->s_nls_iocharset = opt.iocharset ? - load_nls(opt.iocharset) : load_nls_default(); + sbi->s_nls_iocharset = opt->iocharset ? + load_nls(opt->iocharset) : load_nls_default(); if (!sbi->s_nls_iocharset) goto out_freesbi; } @@ -876,29 +838,29 @@ root_found: #endif s->s_op = &isofs_sops; s->s_export_op = &isofs_export_ops; - sbi->s_mapping = opt.map; - sbi->s_rock = (opt.rock ? 2 : 0); + sbi->s_mapping = opt->map; + sbi->s_rock = (opt->rock ? 
2 : 0); sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/ - sbi->s_cruft = opt.cruft; - sbi->s_hide = opt.hide; - sbi->s_showassoc = opt.showassoc; - sbi->s_uid = opt.uid; - sbi->s_gid = opt.gid; - sbi->s_uid_set = opt.uid_set; - sbi->s_gid_set = opt.gid_set; - sbi->s_nocompress = opt.nocompress; - sbi->s_overriderockperm = opt.overriderockperm; + sbi->s_cruft = opt->cruft; + sbi->s_hide = opt->hide; + sbi->s_showassoc = opt->showassoc; + sbi->s_uid = opt->uid; + sbi->s_gid = opt->gid; + sbi->s_uid_set = opt->uid_set; + sbi->s_gid_set = opt->gid_set; + sbi->s_nocompress = opt->nocompress; + sbi->s_overriderockperm = opt->overriderockperm; /* * It would be incredibly stupid to allow people to mark every file * on the disk as suid, so we merely allow them to set the default * permissions. */ - if (opt.fmode != ISOFS_INVALID_MODE) - sbi->s_fmode = opt.fmode & 0777; + if (opt->fmode != ISOFS_INVALID_MODE) + sbi->s_fmode = opt->fmode & 0777; else sbi->s_fmode = ISOFS_INVALID_MODE; - if (opt.dmode != ISOFS_INVALID_MODE) - sbi->s_dmode = opt.dmode & 0777; + if (opt->dmode != ISOFS_INVALID_MODE) + sbi->s_dmode = opt->dmode & 0777; else sbi->s_dmode = ISOFS_INVALID_MODE; @@ -960,12 +922,12 @@ root_found: } } - if (opt.check == 'u') { + if (opt->check == 'u') { /* Only Joliet is case insensitive by default */ if (joliet_level) - opt.check = 'r'; + opt->check = 'r'; else - opt.check = 's'; + opt->check = 's'; } sbi->s_joliet_level = joliet_level; @@ -980,9 +942,9 @@ root_found: table = 0; if (joliet_level) table += 2; - if (opt.check == 'r') + if (opt->check == 'r') table++; - sbi->s_check = opt.check; + sbi->s_check = opt->check; if (table) s->s_d_op = &isofs_dentry_ops[table - 1]; @@ -994,7 +956,7 @@ root_found: goto out_no_inode; } - kfree(opt.iocharset); + kfree(opt->iocharset); return 0; @@ -1023,7 +985,7 @@ out_bad_zone_size: goto out_freebh; out_bad_size: printk(KERN_WARNING "ISOFS: Logical zone size(%d) < hardware blocksize(%u)\n", - 
orig_zonesize, opt.blocksize); + orig_zonesize, opt->blocksize); goto out_freebh; out_unknown_format: if (!silent) @@ -1033,7 +995,7 @@ out_freebh: brelse(bh); brelse(pri_bh); out_freesbi: - kfree(opt.iocharset); + kfree(opt->iocharset); kfree(sbi); s->s_fs_info = NULL; return error; @@ -1567,18 +1529,63 @@ struct inode *__isofs_iget(struct super_block *sb, return inode; } -static struct dentry *isofs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int isofs_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super); + return get_tree_bdev(fc, isofs_fill_super); +} + +static void isofs_free_fc(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations isofs_context_ops = { + .parse_param = isofs_parse_param, + .get_tree = isofs_get_tree, + .reconfigure = isofs_reconfigure, + .free = isofs_free_fc, +}; + +static int isofs_init_fs_context(struct fs_context *fc) +{ + struct isofs_options *opt; + + opt = kzalloc(sizeof(*opt), GFP_KERNEL); + if (!opt) + return -ENOMEM; + + opt->map = 'n'; + opt->rock = 1; + opt->joliet = 1; + opt->cruft = 0; + opt->hide = 0; + opt->showassoc = 0; + opt->check = 'u'; /* unset */ + opt->nocompress = 0; + opt->blocksize = 1024; + opt->fmode = opt->dmode = ISOFS_INVALID_MODE; + opt->uid_set = 0; + opt->gid_set = 0; + opt->gid = GLOBAL_ROOT_GID; + opt->uid = GLOBAL_ROOT_UID; + opt->iocharset = NULL; + opt->overriderockperm = 0; + opt->session = -1; + opt->sbsector = -1; + + fc->fs_private = opt; + fc->ops = &isofs_context_ops; + + return 0; } static struct file_system_type iso9660_fs_type = { .owner = THIS_MODULE, .name = "iso9660", - .mount = isofs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = isofs_init_fs_context, + .parameters = isofs_param_spec, }; MODULE_ALIAS_FS("iso9660"); MODULE_ALIAS("iso9660"); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 
1c97e64c4784..951f78634adf 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -337,8 +337,6 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ -enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP}; - /* * journal_shrink_one_cp_list * @@ -350,7 +348,7 @@ enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP}; * Called with j_list_lock held. */ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, - enum shrink_type type, + enum jbd2_shrink_type type, bool *released) { struct journal_head *last_jh; @@ -367,12 +365,12 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, jh = next_jh; next_jh = jh->b_cpnext; - if (type == SHRINK_DESTROY) { + if (type == JBD2_SHRINK_DESTROY) { ret = __jbd2_journal_remove_checkpoint(jh); } else { ret = jbd2_journal_try_remove_checkpoint(jh); if (ret < 0) { - if (type == SHRINK_BUSY_SKIP) + if (type == JBD2_SHRINK_BUSY_SKIP) continue; break; } @@ -439,7 +437,7 @@ again: tid = transaction->t_tid; freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list, - SHRINK_BUSY_SKIP, &released); + JBD2_SHRINK_BUSY_SKIP, &released); nr_freed += freed; (*nr_to_scan) -= min(*nr_to_scan, freed); if (*nr_to_scan == 0) @@ -472,21 +470,25 @@ out: * journal_clean_checkpoint_list * * Find all the written-back checkpoint buffers in the journal and release them. - * If 'destroy' is set, release all buffers unconditionally. + * If 'type' is JBD2_SHRINK_DESTROY, release all buffers unconditionally. If + * 'type' is JBD2_SHRINK_BUSY_STOP, will stop release buffers if encounters a + * busy buffer. To avoid wasting CPU cycles scanning the buffer list in some + * cases, don't pass JBD2_SHRINK_BUSY_SKIP 'type' for this function. * * Called with j_list_lock held. 
*/ -void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) +void __jbd2_journal_clean_checkpoint_list(journal_t *journal, + enum jbd2_shrink_type type) { transaction_t *transaction, *last_transaction, *next_transaction; - enum shrink_type type; bool released; + WARN_ON_ONCE(type == JBD2_SHRINK_BUSY_SKIP); + transaction = journal->j_checkpoint_transactions; if (!transaction) return; - type = destroy ? SHRINK_DESTROY : SHRINK_BUSY_STOP; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { @@ -527,7 +529,7 @@ void jbd2_journal_destroy_checkpoint(journal_t *journal) spin_unlock(&journal->j_list_lock); break; } - __jbd2_journal_clean_checkpoint_list(journal, true); + __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_DESTROY); spin_unlock(&journal->j_list_lock); cond_resched(); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 5e122586e06e..75ea4e9a5cab 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -501,7 +501,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * frees some memory */ spin_lock(&journal->j_list_lock); - __jbd2_journal_clean_checkpoint_list(journal, false); + __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_BUSY_STOP); spin_unlock(&journal->j_list_lock); jbd2_debug(3, "JBD2: commit phase 1\n"); @@ -571,7 +571,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT(commit_transaction->t_nr_buffers <= atomic_read(&commit_transaction->t_outstanding_credits)); - err = 0; bufs = 0; descriptor = NULL; while (commit_transaction->t_buffers) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b6c114c11b97..03c4b9214f56 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2009,7 +2009,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) byte_count = (block_stop - block_start + 1) * journal->j_blocksize; - truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping, + truncate_inode_pages_range(journal->j_dev->bd_mapping, 
byte_start, byte_stop); if (flags & JBD2_JOURNAL_FLUSH_DISCARD) { diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index e29f4edf9572..1358c21837f1 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -206,7 +206,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, struct super_block *sb) { struct dentry *dentry; - struct kernfs_node *knparent = NULL; + struct kernfs_node *knparent; BUG_ON(sb->s_op != &kernfs_sops); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 127a728fcbc8..c11516801784 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -117,7 +117,6 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, if (nsm != NULL) refcount_inc(&nsm->sm_count); else { - host = NULL; nsm = nsm_get_handle(ni->net, ni->sap, ni->salen, ni->hostname, ni->hostname_len); if (unlikely(nsm == NULL)) { diff --git a/fs/namei.c b/fs/namei.c index cb5dde0e309f..37fb0a8aa09a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3676,9 +3676,9 @@ static int do_open(struct nameidata *nd, * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ -static int vfs_tmpfile(struct mnt_idmap *idmap, - const struct path *parentpath, - struct file *file, umode_t mode) +int vfs_tmpfile(struct mnt_idmap *idmap, + const struct path *parentpath, + struct file *file, umode_t mode) { struct dentry *child; struct inode *dir = d_inode(parentpath->dentry); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index f7e32d76e34d..57249f040dfc 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -33,12 +33,12 @@ config NFS_FS config NFS_V2 tristate "NFS client support for NFS version 2" depends on NFS_FS - default y + default n help This option enables support for version 2 of the NFS protocol (RFC 1094) in the kernel's NFS client. - If unsure, say Y. + If unsure, say N. 
config NFS_V3 tristate "NFS client support for NFS version 3" diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ac505671efbd..342930996226 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -56,6 +56,8 @@ static int nfs_readdir(struct file *, struct dir_context *); static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); static void nfs_readdir_clear_array(struct folio *); +static int nfs_do_create(struct inode *dir, struct dentry *dentry, + umode_t mode, int open_flags); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, @@ -2243,6 +2245,41 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) #endif /* CONFIG_NFSV4 */ +int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned int open_flags, + umode_t mode) +{ + + /* Same as look+open from lookup_open(), but with different O_TRUNC + * handling. + */ + int error = 0; + + if (open_flags & O_CREAT) { + file->f_mode |= FMODE_CREATED; + error = nfs_do_create(dir, dentry, mode, open_flags); + if (error) + return error; + return finish_open(file, dentry, NULL); + } else if (d_in_lookup(dentry)) { + /* The only flags nfs_lookup considers are + * LOOKUP_EXCL and LOOKUP_RENAME_TARGET, and + * we want those to be zero so the lookup isn't skipped. + */ + struct dentry *res = nfs_lookup(dir, dentry, 0); + + d_lookup_done(dentry); + if (unlikely(res)) { + if (IS_ERR(res)) + return PTR_ERR(res); + return finish_no_open(file, res); + } + } + return finish_no_open(file, NULL); + +} +EXPORT_SYMBOL_GPL(nfs_atomic_open_v23); + struct dentry * nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle, struct nfs_fattr *fattr) @@ -2303,18 +2340,23 @@ EXPORT_SYMBOL_GPL(nfs_instantiate); * that the operation succeeded on the server, but an error in the * reply path made it appear to have failed. 
*/ -int nfs_create(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode, bool excl) +static int nfs_do_create(struct inode *dir, struct dentry *dentry, + umode_t mode, int open_flags) { struct iattr attr; - int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT; int error; + open_flags |= O_CREAT; + dfprintk(VFS, "NFS: create(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; + if (open_flags & O_TRUNC) { + attr.ia_size = 0; + attr.ia_valid |= ATTR_SIZE; + } trace_nfs_create_enter(dir, dentry, open_flags); error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); @@ -2326,6 +2368,12 @@ out_err: d_drop(dentry); return error; } + +int nfs_create(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) +{ + return nfs_do_create(dir, dentry, mode, excl ? O_EXCL : 0); +} EXPORT_SYMBOL_GPL(nfs_create); /* diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index ce8f8934bca5..29d84dc66ca3 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -605,14 +605,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); - /* FIXME: remove this check when layout segment support is added */ - if (lgr->range.offset != 0 || - lgr->range.length != NFS4_MAX_UINT64) { - dprintk("%s Only whole file layouts supported. 
Use MDS i/o\n", - __func__); - goto out; - } - if (fl->pattern_offset > lgr->range.offset) { dprintk("%s pattern_offset %lld too large\n", __func__, fl->pattern_offset); @@ -875,15 +867,15 @@ static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - pnfs_generic_pg_check_layout(pgio); + pnfs_generic_pg_check_layout(pgio, req); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), - 0, - NFS4_MAX_UINT64, + req_offset(req), + req->wb_bytes, IOMODE_READ, false, - GFP_KERNEL); + nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -899,15 +891,15 @@ static void filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - pnfs_generic_pg_check_layout(pgio); + pnfs_generic_pg_check_layout(pgio, req); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), - 0, - NFS4_MAX_UINT64, + req_offset(req), + req->wb_bytes, IOMODE_RW, false, - GFP_NOFS); + nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 3e724cb7ef01..24188af56d5b 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -823,14 +823,6 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, } static void -ff_layout_pg_check_layout(struct nfs_pageio_descriptor *pgio, - struct nfs_page *req) -{ - pnfs_generic_pg_check_layout(pgio); - pnfs_generic_pg_check_range(pgio, req); -} - -static void ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { @@ -840,7 +832,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, u32 ds_idx; retry: - ff_layout_pg_check_layout(pgio, req); + pnfs_generic_pg_check_layout(pgio, req); /* Use full layout for now */ if (!pgio->pg_lseg) { 
ff_layout_pg_get_read(pgio, req, false); @@ -895,7 +887,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, u32 i; retry: - ff_layout_pg_check_layout(pgio, req); + pnfs_generic_pg_check_layout(pgio, req); if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index d0a0956f8a13..6c9f3f6645dd 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -600,9 +600,11 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, break; case Opt_lock: if (result.negated) { + ctx->lock_status = NFS_LOCK_NOLOCK; ctx->flags |= NFS_MOUNT_NONLM; ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); } else { + ctx->lock_status = NFS_LOCK_LOCK; ctx->flags &= ~NFS_MOUNT_NONLM; ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); } @@ -1112,9 +1114,12 @@ static int nfs23_parse_monolithic(struct fs_context *fc, ctx->acdirmax = data->acdirmax; ctx->need_mount = false; - memcpy(sap, &data->addr, sizeof(data->addr)); - ctx->nfs_server.addrlen = sizeof(data->addr); - ctx->nfs_server.port = ntohs(data->addr.sin_port); + if (!is_remount_fc(fc)) { + memcpy(sap, &data->addr, sizeof(data->addr)); + ctx->nfs_server.addrlen = sizeof(data->addr); + ctx->nfs_server.port = ntohs(data->addr.sin_port); + } + if (sap->ss_family != AF_INET || !nfs_verify_server_address(sap)) goto out_no_address; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 06253695fe53..9f0f4534744b 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -112,6 +112,7 @@ struct nfs_fs_context { unsigned short protofamily; unsigned short mountfamily; bool has_sec_mnt_opts; + int lock_status; struct { union { @@ -153,6 +154,12 @@ struct nfs_fs_context { } clone_data; }; +enum nfs_lock_status { + NFS_LOCK_NOT_SET = 0, + NFS_LOCK_LOCK = 1, + NFS_LOCK_NOLOCK = 2, +}; + #define nfs_errorf(fc, fmt, ...) ((fc)->log.log ? 
\ errorf(fc, fmt, ## __VA_ARGS__) : \ ({ dprintk(fmt "\n", ## __VA_ARGS__); })) @@ -710,9 +717,9 @@ unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp) if ((bsize & (bsize - 1)) || nrbitsp) { unsigned char nrbits; - for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--) + for (nrbits = 31; nrbits && !(bsize & (1UL << nrbits)); nrbits--) ; - bsize = 1 << nrbits; + bsize = 1UL << nrbits; if (nrbitsp) *nrbitsp = nrbits; } diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index 5aa776b5a3e7..b17a9eb9b148 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -46,10 +46,7 @@ static inline void nfs_add_stats(const struct inode *inode, nfs_add_server_stats(NFS_SERVER(inode), stat, addend); } -static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void) -{ - return alloc_percpu(struct nfs_iostats); -} +#define nfs_alloc_iostats() alloc_percpu(struct nfs_iostats) static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats) { diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index cbbe3f0193b8..74bda639a7cf 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -986,6 +986,7 @@ static int nfs3_have_delegation(struct inode *inode, fmode_t flags) static const struct inode_operations nfs3_dir_inode_operations = { .create = nfs_create, + .atomic_open = nfs_atomic_open_v23, .lookup = nfs_lookup, .link = nfs_link, .unlink = nfs_unlink, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ea390db94b62..c93c12063b3a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5456,7 +5456,7 @@ static bool nfs4_read_plus_not_supported(struct rpc_task *task, struct rpc_message *msg = &task->tk_msg; if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS] && - server->caps & NFS_CAP_READ_PLUS && task->tk_status == -ENOTSUPP) { + task->tk_status == -ENOTSUPP) { server->caps &= ~NFS_CAP_READ_PLUS; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; rpc_restart_call_prepare(task); diff --git a/fs/nfs/nfs4state.c 
b/fs/nfs/nfs4state.c index 662e86ea3a2d..5b452411e8fd 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2116,6 +2116,7 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred { struct nfs_client *clp = server->nfs_client; struct nfs4_fs_locations *locations = NULL; + struct nfs_fattr *fattr; struct inode *inode; struct page *page; int status, result; @@ -2125,19 +2126,16 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred (unsigned long long)server->fsid.minor, clp->cl_hostname); - result = 0; page = alloc_page(GFP_KERNEL); locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); - if (page == NULL || locations == NULL) { - dprintk("<-- %s: no memory\n", __func__); - goto out; - } - locations->fattr = nfs_alloc_fattr(); - if (locations->fattr == NULL) { + fattr = nfs_alloc_fattr(); + if (page == NULL || locations == NULL || fattr == NULL) { dprintk("<-- %s: no memory\n", __func__); + result = 0; goto out; } + locations->fattr = fattr; inode = d_inode(server->super->s_root); result = nfs4_proc_get_locations(server, NFS_FH(inode), locations, page, cred); diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 10985a4b8259..4de8780a7c48 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -47,7 +47,7 @@ DECLARE_EVENT_CLASS(nfs4_clientid_event, TP_fast_assign( __entry->error = error < 0 ? -error : 0; - __assign_str(dstaddr, clp->cl_hostname); + __assign_str(dstaddr); ), TP_printk( @@ -94,8 +94,8 @@ TRACE_EVENT(nfs4_trunked_exchange_id, TP_fast_assign( __entry->error = error < 0 ? 
-error : 0; - __assign_str(main_addr, clp->cl_hostname); - __assign_str(trunk_addr, addr); + __assign_str(main_addr); + __assign_str(trunk_addr); ), TP_printk( @@ -365,7 +365,7 @@ TRACE_EVENT(nfs4_state_mgr, TP_fast_assign( __entry->state = clp->cl_state; - __assign_str(hostname, clp->cl_hostname); + __assign_str(hostname); ), TP_printk( @@ -393,8 +393,8 @@ TRACE_EVENT(nfs4_state_mgr_failed, TP_fast_assign( __entry->error = status < 0 ? -status : 0; __entry->state = clp->cl_state; - __assign_str(hostname, clp->cl_hostname); - __assign_str(section, section); + __assign_str(hostname); + __assign_str(section); ), TP_printk( @@ -578,7 +578,7 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __entry->fhandle = 0; } __entry->dir = NFS_FILEID(d_inode(ctx->dentry->d_parent)); - __assign_str(name, ctx->dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -1072,7 +1072,7 @@ DECLARE_EVENT_CLASS(nfs4_lookup_event, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->error = -error; - __assign_str(name, name->name); + __assign_str(name); ), TP_printk( @@ -1156,8 +1156,8 @@ TRACE_EVENT(nfs4_rename, __entry->olddir = NFS_FILEID(olddir); __entry->newdir = NFS_FILEID(newdir); __entry->error = error < 0 ? -error : 0; - __assign_str(oldname, oldname->name); - __assign_str(newname, newname->name); + __assign_str(oldname); + __assign_str(newname); ), TP_printk( @@ -1359,7 +1359,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown"); + __assign_str(dstaddr); ), TP_printk( @@ -1416,7 +1416,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? 
clp->cl_hostname : "unknown"); + __assign_str(dstaddr); __entry->stateid_seq = be32_to_cpu(stateid->seqid); __entry->stateid_hash = @@ -1960,7 +1960,7 @@ DECLARE_EVENT_CLASS(nfs4_deviceid_event, ), TP_fast_assign( - __assign_str(dstaddr, clp->cl_hostname); + __assign_str(dstaddr); memcpy(__entry->deviceid, deviceid->data, NFS4_DEVICEID4_SIZE); ), @@ -1998,7 +1998,7 @@ DECLARE_EVENT_CLASS(nfs4_deviceid_status, TP_fast_assign( __entry->dev = server->s_dev; __entry->status = status; - __assign_str(dstaddr, server->nfs_client->cl_hostname); + __assign_str(dstaddr); memcpy(__entry->deviceid, deviceid->data, NFS4_DEVICEID4_SIZE); ), @@ -2036,8 +2036,8 @@ TRACE_EVENT(fl_getdevinfo, ), TP_fast_assign( - __assign_str(mds_addr, server->nfs_client->cl_hostname); - __assign_str(ds_ips, ds_remotestr); + __assign_str(mds_addr); + __assign_str(ds_ips); memcpy(__entry->deviceid, deviceid->data, NFS4_DEVICEID4_SIZE); ), @@ -2083,9 +2083,7 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, be32_to_cpu(hdr->args.stateid.seqid); __entry->stateid_hash = nfs_stateid_hash(&hdr->args.stateid); - __assign_str(dstaddr, hdr->ds_clp ? - rpc_peeraddr2str(hdr->ds_clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown"); + __assign_str(dstaddr); ), TP_printk( @@ -2139,9 +2137,7 @@ TRACE_EVENT(ff_layout_commit_error, __entry->dev = inode->i_sb->s_dev; __entry->offset = data->args.offset; __entry->count = data->args.count; - __assign_str(dstaddr, data->ds_clp ? 
- rpc_peeraddr2str(data->ds_clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown"); + __assign_str(dstaddr); ), TP_printk( @@ -2579,7 +2575,7 @@ DECLARE_EVENT_CLASS(nfs4_xattr_event, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); - __assign_str(name, name); + __assign_str(name); ), TP_printk( diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index afedb449b54f..1e710654af11 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -409,7 +409,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event, __entry->dir = NFS_FILEID(dir); __entry->flags = flags; __entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry)); - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -457,7 +457,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done, __entry->error = error < 0 ? -error : 0; __entry->flags = flags; __entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry)); - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -512,7 +512,7 @@ TRACE_EVENT(nfs_atomic_open_enter, __entry->dir = NFS_FILEID(dir); __entry->flags = flags; __entry->fmode = (__force unsigned long)ctx->mode; - __assign_str(name, ctx->dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -551,7 +551,7 @@ TRACE_EVENT(nfs_atomic_open_exit, __entry->dir = NFS_FILEID(dir); __entry->flags = flags; __entry->fmode = (__force unsigned long)ctx->mode; - __assign_str(name, ctx->dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -587,7 +587,7 @@ TRACE_EVENT(nfs_create_enter, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->flags = flags; - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -623,7 +623,7 @@ TRACE_EVENT(nfs_create_exit, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->flags = flags; - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -654,7 +654,7 
@@ DECLARE_EVENT_CLASS(nfs_directory_event, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -693,7 +693,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event_done, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->error = error < 0 ? -error : 0; - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -747,7 +747,7 @@ TRACE_EVENT(nfs_link_enter, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->dir = NFS_FILEID(dir); - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -783,7 +783,7 @@ TRACE_EVENT(nfs_link_exit, __entry->fileid = NFS_FILEID(inode); __entry->dir = NFS_FILEID(dir); __entry->error = error < 0 ? -error : 0; - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -819,8 +819,8 @@ DECLARE_EVENT_CLASS(nfs_rename_event, __entry->dev = old_dir->i_sb->s_dev; __entry->old_dir = NFS_FILEID(old_dir); __entry->new_dir = NFS_FILEID(new_dir); - __assign_str(old_name, old_dentry->d_name.name); - __assign_str(new_name, new_dentry->d_name.name); + __assign_str(old_name); + __assign_str(new_name); ), TP_printk( @@ -868,8 +868,8 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done, __entry->error = -error; __entry->old_dir = NFS_FILEID(old_dir); __entry->new_dir = NFS_FILEID(new_dir); - __assign_str(old_name, old_dentry->d_name.name); - __assign_str(new_name, new_dentry->d_name.name); + __assign_str(old_name); + __assign_str(new_name); ), TP_printk( @@ -1636,8 +1636,8 @@ TRACE_EVENT(nfs_mount_assign, ), TP_fast_assign( - __assign_str(option, option); - __assign_str(value, value); + __assign_str(option); + __assign_str(value); ), TP_printk("option %s=%s", @@ -1657,7 +1657,7 @@ TRACE_EVENT(nfs_mount_option, ), TP_fast_assign( - __assign_str(option, param->key); + __assign_str(option); ), TP_printk("option %s", __get_str(option)) @@ -1675,7 
+1675,7 @@ TRACE_EVENT(nfs_mount_path, ), TP_fast_assign( - __assign_str(path, path); + __assign_str(path); ), TP_printk("path='%s'", __get_str(path)) @@ -1710,9 +1710,8 @@ DECLARE_EVENT_CLASS(nfs_xdr_event, __entry->xid = be32_to_cpu(rqstp->rq_xid); __entry->version = task->tk_client->cl_vers; __entry->error = error; - __assign_str(program, - task->tk_client->cl_program->name); - __assign_str(procedure, task->tk_msg.rpc_proc->p_name); + __assign_str(program); + __assign_str(procedure); ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index a5cc6199127f..b5834728f31b 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2705,38 +2705,28 @@ pnfs_layout_return_unused_byclid(struct nfs_client *clp, &range); } +/* Check if we have we have a valid layout but if there isn't an intersection + * between the request and the pgio->pg_lseg, put this pgio->pg_lseg away. + */ void -pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio) +pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) { if (pgio->pg_lseg == NULL || - test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags)) + (test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags) && + pnfs_lseg_request_intersecting(pgio->pg_lseg, req))) return; pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout); -/* - * Check for any intersection between the request and the pgio->pg_lseg, - * and if none, put this pgio->pg_lseg away. 
- */ -void -pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) -{ - if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) { - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; - } -} -EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range); - void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { u64 rd_size; - pnfs_generic_pg_check_layout(pgio); - pnfs_generic_pg_check_range(pgio, req); + pnfs_generic_pg_check_layout(pgio, req); if (pgio->pg_lseg == NULL) { if (pgio->pg_dreq == NULL) rd_size = i_size_read(pgio->pg_inode) - req_offset(req); @@ -2766,8 +2756,7 @@ void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { - pnfs_generic_pg_check_layout(pgio); - pnfs_generic_pg_check_range(pgio, req); + pnfs_generic_pg_check_layout(pgio, req); if (pgio->pg_lseg == NULL) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index db57a85500ee..fa5beeaaf5da 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -257,8 +257,7 @@ void pnfs_put_lseg(struct pnfs_layout_segment *lseg); void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); void unset_pnfs_layoutdriver(struct nfs_server *); -void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio); -void pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req); +void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio, struct nfs_page *req); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index ad3a321ae997..d105e5b2659d 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -695,6 +695,7 @@ static int nfs_have_delegation(struct inode *inode, 
fmode_t flags) static const struct inode_operations nfs_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, + .atomic_open = nfs_atomic_open_v23, .link = nfs_link, .unlink = nfs_unlink, .symlink = nfs_symlink, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index dc03f98f7616..cbbd4866b0b7 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -901,6 +901,16 @@ static struct nfs_server *nfs_try_mount_request(struct fs_context *fc) rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS]; unsigned int authlist_len = ARRAY_SIZE(authlist); + /* make sure 'nolock'/'lock' override the 'local_lock' mount option */ + if (ctx->lock_status) { + if (ctx->lock_status == NFS_LOCK_NOLOCK) { + ctx->flags |= NFS_MOUNT_NONLM; + ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); + } else { + ctx->flags &= ~NFS_MOUNT_NONLM; + ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); + } + } status = nfs_request_mount(fc, ctx->mntfh, authlist, &authlist_len); if (status) return ERR_PTR(status); diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 7b641095a665..50b3135d07ac 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -334,21 +334,25 @@ static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc) static int export_stats_init(struct export_stats *stats) { stats->start_time = ktime_get_seconds(); - return nfsd_percpu_counters_init(stats->counter, EXP_STATS_COUNTERS_NUM); + return percpu_counter_init_many(stats->counter, 0, GFP_KERNEL, + EXP_STATS_COUNTERS_NUM); } static void export_stats_reset(struct export_stats *stats) { - if (stats) - nfsd_percpu_counters_reset(stats->counter, - EXP_STATS_COUNTERS_NUM); + if (stats) { + int i; + + for (i = 0; i < EXP_STATS_COUNTERS_NUM; i++) + percpu_counter_set(&stats->counter[i], 0); + } } static void export_stats_destroy(struct export_stats *stats) { if (stats) - nfsd_percpu_counters_destroy(stats->counter, - EXP_STATS_COUNTERS_NUM); + percpu_counter_destroy_many(stats->counter, + EXP_STATS_COUNTERS_NUM); } static 
void svc_export_put(struct kref *ref) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ddd3e0d9cfa6..ad9083ca144b 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -159,8 +159,8 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode) do { fsnotify_group_lock(nfsd_file_fsnotify_group); - mark = fsnotify_find_mark(&inode->i_fsnotify_marks, - nfsd_file_fsnotify_group); + mark = fsnotify_find_inode_mark(inode, + nfsd_file_fsnotify_group); if (mark) { nfm = nfsd_file_mark_get(container_of(mark, struct nfsd_file_mark, diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c index 0e1d635ec5f9..62d2586d9902 100644 --- a/fs/nfsd/netlink.c +++ b/fs/nfsd/netlink.c @@ -10,6 +10,36 @@ #include <uapi/linux/nfsd_netlink.h> +/* Common nested types */ +const struct nla_policy nfsd_sock_nl_policy[NFSD_A_SOCK_TRANSPORT_NAME + 1] = { + [NFSD_A_SOCK_ADDR] = { .type = NLA_BINARY, }, + [NFSD_A_SOCK_TRANSPORT_NAME] = { .type = NLA_NUL_STRING, }, +}; + +const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1] = { + [NFSD_A_VERSION_MAJOR] = { .type = NLA_U32, }, + [NFSD_A_VERSION_MINOR] = { .type = NLA_U32, }, + [NFSD_A_VERSION_ENABLED] = { .type = NLA_FLAG, }, +}; + +/* NFSD_CMD_THREADS_SET - do */ +static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_SCOPE + 1] = { + [NFSD_A_SERVER_THREADS] = { .type = NLA_U32, }, + [NFSD_A_SERVER_GRACETIME] = { .type = NLA_U32, }, + [NFSD_A_SERVER_LEASETIME] = { .type = NLA_U32, }, + [NFSD_A_SERVER_SCOPE] = { .type = NLA_NUL_STRING, }, +}; + +/* NFSD_CMD_VERSION_SET - do */ +static const struct nla_policy nfsd_version_set_nl_policy[NFSD_A_SERVER_PROTO_VERSION + 1] = { + [NFSD_A_SERVER_PROTO_VERSION] = NLA_POLICY_NESTED(nfsd_version_nl_policy), +}; + +/* NFSD_CMD_LISTENER_SET - do */ +static const struct nla_policy nfsd_listener_set_nl_policy[NFSD_A_SERVER_SOCK_ADDR + 1] = { + [NFSD_A_SERVER_SOCK_ADDR] = NLA_POLICY_NESTED(nfsd_sock_nl_policy), +}; + /* Ops table for nfsd */ static 
const struct genl_split_ops nfsd_nl_ops[] = { { @@ -19,6 +49,42 @@ static const struct genl_split_ops nfsd_nl_ops[] = { .done = nfsd_nl_rpc_status_get_done, .flags = GENL_CMD_CAP_DUMP, }, + { + .cmd = NFSD_CMD_THREADS_SET, + .doit = nfsd_nl_threads_set_doit, + .policy = nfsd_threads_set_nl_policy, + .maxattr = NFSD_A_SERVER_SCOPE, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = NFSD_CMD_THREADS_GET, + .doit = nfsd_nl_threads_get_doit, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NFSD_CMD_VERSION_SET, + .doit = nfsd_nl_version_set_doit, + .policy = nfsd_version_set_nl_policy, + .maxattr = NFSD_A_SERVER_PROTO_VERSION, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = NFSD_CMD_VERSION_GET, + .doit = nfsd_nl_version_get_doit, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NFSD_CMD_LISTENER_SET, + .doit = nfsd_nl_listener_set_doit, + .policy = nfsd_listener_set_nl_policy, + .maxattr = NFSD_A_SERVER_SOCK_ADDR, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = NFSD_CMD_LISTENER_GET, + .doit = nfsd_nl_listener_get_doit, + .flags = GENL_CMD_CAP_DO, + }, }; struct genl_family nfsd_nl_family __ro_after_init = { diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h index d83dd6bdee92..e3724637d64d 100644 --- a/fs/nfsd/netlink.h +++ b/fs/nfsd/netlink.h @@ -11,11 +11,21 @@ #include <uapi/linux/nfsd_netlink.h> +/* Common nested types */ +extern const struct nla_policy nfsd_sock_nl_policy[NFSD_A_SOCK_TRANSPORT_NAME + 1]; +extern const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1]; + int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb); int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb); int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info); +int nfsd_nl_threads_get_doit(struct sk_buff *skb, struct genl_info *info); +int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info); +int 
nfsd_nl_version_get_doit(struct sk_buff *skb, struct genl_info *info); +int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info); +int nfsd_nl_listener_get_doit(struct sk_buff *skb, struct genl_info *info); extern struct genl_family nfsd_nl_family; diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index d4be519b5734..14ec15656320 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -218,6 +218,7 @@ struct nfsd_net { /* Simple check to find out if a given net was properly initialized */ #define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl) +extern bool nfsd_support_version(int vers); extern void nfsd_netns_free_versions(struct nfsd_net *nn); extern unsigned int nfsd_net_id; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index e88aca0c6e8e..d756f443fc44 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -978,12 +978,12 @@ static int max_cb_time(struct net *net) return max(((u32)nn->nfsd4_lease)/10, 1u) * HZ; } -static struct workqueue_struct *callback_wq; - static bool nfsd4_queue_cb(struct nfsd4_callback *cb) { - trace_nfsd_cb_queue(cb->cb_clp, cb); - return queue_work(callback_wq, &cb->cb_work); + struct nfs4_client *clp = cb->cb_clp; + + trace_nfsd_cb_queue(clp, cb); + return queue_work(clp->cl_callback_wq, &cb->cb_work); } static void nfsd41_cb_inflight_begin(struct nfs4_client *clp) @@ -1153,7 +1153,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp) void nfsd4_probe_callback_sync(struct nfs4_client *clp) { nfsd4_probe_callback(clp); - flush_workqueue(callback_wq); + flush_workqueue(clp->cl_callback_wq); } void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) @@ -1372,19 +1372,6 @@ static const struct rpc_call_ops nfsd4_cb_ops = { .rpc_release = nfsd4_cb_release, }; -int nfsd4_create_callback_queue(void) -{ - callback_wq = alloc_ordered_workqueue("nfsd4_callbacks", 0); - if (!callback_wq) - return -ENOMEM; - return 0; -} - -void nfsd4_destroy_callback_queue(void) -{ - 
destroy_workqueue(callback_wq); -} - /* must be called under the state lock */ void nfsd4_shutdown_callback(struct nfs4_client *clp) { @@ -1398,7 +1385,7 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp) * client, destroy the rpc client, and stop: */ nfsd4_run_cb(&clp->cl_cb_null); - flush_workqueue(callback_wq); + flush_workqueue(clp->cl_callback_wq); nfsd41_cb_inflight_wait_complete(clp); } @@ -1420,9 +1407,9 @@ static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) /* * Note there isn't a lot of locking in this code; instead we depend on - * the fact that it is run from the callback_wq, which won't run two - * work items at once. So, for example, callback_wq handles all access - * of cl_cb_client and all calls to rpc_create or rpc_shutdown_client. + * the fact that it is run from clp->cl_callback_wq, which won't run two + * work items at once. So, for example, clp->cl_callback_wq handles all + * access of cl_cb_client and all calls to rpc_create or rpc_shutdown_client. 
*/ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 2927b1263f08..46bd20fe5c0f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1737,7 +1737,7 @@ static void cleanup_async_copy(struct nfsd4_copy *copy) nfs4_put_copy(copy); } -static void nfsd4_send_cb_offload(struct nfsd4_copy *copy, __be32 nfserr) +static void nfsd4_send_cb_offload(struct nfsd4_copy *copy) { struct nfsd4_cb_offload *cbo; @@ -1747,12 +1747,12 @@ static void nfsd4_send_cb_offload(struct nfsd4_copy *copy, __be32 nfserr) memcpy(&cbo->co_res, &copy->cp_res, sizeof(copy->cp_res)); memcpy(&cbo->co_fh, &copy->fh, sizeof(copy->fh)); - cbo->co_nfserr = nfserr; + cbo->co_nfserr = copy->nfserr; nfsd4_init_cb(&cbo->co_cb, copy->cp_clp, &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD); trace_nfsd_cb_offload(copy->cp_clp, &cbo->co_res.cb_stateid, - &cbo->co_fh, copy->cp_count, nfserr); + &cbo->co_fh, copy->cp_count, copy->nfserr); nfsd4_run_cb(&cbo->co_cb); } @@ -1766,7 +1766,6 @@ static void nfsd4_send_cb_offload(struct nfsd4_copy *copy, __be32 nfserr) static int nfsd4_do_async_copy(void *data) { struct nfsd4_copy *copy = (struct nfsd4_copy *)data; - __be32 nfserr; trace_nfsd_copy_do_async(copy); if (nfsd4_ssc_is_inter(copy)) { @@ -1777,24 +1776,25 @@ static int nfsd4_do_async_copy(void *data) if (IS_ERR(filp)) { switch (PTR_ERR(filp)) { case -EBADF: - nfserr = nfserr_wrong_type; + copy->nfserr = nfserr_wrong_type; break; default: - nfserr = nfserr_offload_denied; + copy->nfserr = nfserr_offload_denied; } /* ss_mnt will be unmounted by the laundromat */ goto do_callback; } - nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file, - false); + copy->nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file, + false); nfsd4_cleanup_inter_ssc(copy->ss_nsui, filp, copy->nf_dst); } else { - nfserr = nfsd4_do_copy(copy, copy->nf_src->nf_file, - copy->nf_dst->nf_file, false); + copy->nfserr = nfsd4_do_copy(copy, copy->nf_src->nf_file, + 
copy->nf_dst->nf_file, false); } do_callback: - nfsd4_send_cb_offload(copy, nfserr); + set_bit(NFSD4_COPY_F_COMPLETED, &copy->cp_flags); + nfsd4_send_cb_offload(copy); cleanup_async_copy(copy); return 0; } @@ -1807,6 +1807,13 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct nfsd4_copy *async_copy = NULL; + /* + * Currently, async COPY is not reliable. Force all COPY + * requests to be synchronous to avoid client application + * hangs waiting for COPY completion. + */ + nfsd4_copy_set_sync(copy, true); + copy->cp_clp = cstate->clp; if (nfsd4_ssc_is_inter(copy)) { trace_nfsd_copy_inter(copy); @@ -2003,11 +2010,16 @@ nfsd4_offload_status(struct svc_rqst *rqstp, struct nfsd4_copy *copy; struct nfs4_client *clp = cstate->clp; + os->completed = false; spin_lock(&clp->async_lock); copy = find_async_copy_locked(clp, &os->stateid); - if (copy) + if (copy) { os->count = copy->cp_res.wr_bytes_written; - else + if (test_bit(NFSD4_COPY_F_COMPLETED, &copy->cp_flags)) { + os->completed = true; + os->status = copy->nfserr; + } + } else status = nfserr_bad_stateid; spin_unlock(&clp->async_lock); @@ -2154,6 +2166,29 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status == nfserr_same ? nfs_ok : status; } +static __be32 +nfsd4_get_dir_delegation(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_get_dir_delegation *gdd = &u->get_dir_delegation; + + /* + * RFC 8881, section 18.39.3 says: + * + * "The server may refuse to grant the delegation. In that case, the + * server will return NFS4ERR_DIRDELEG_UNAVAIL." + * + * This is sub-optimal, since it means that the server would need to + * abort compound processing just because the delegation wasn't + * available. RFC8881bis should change this to allow the server to + * return NFS4_OK with a non-fatal status of GDD4_UNAVAIL in this + * situation. 
+ */ + gdd->gddrnf_status = GDD4_UNAVAIL; + return nfs_ok; +} + #ifdef CONFIG_NFSD_PNFS static const struct nfsd4_layout_ops * nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type) @@ -3082,6 +3117,18 @@ static u32 nfsd4_copy_notify_rsize(const struct svc_rqst *rqstp, * sizeof(__be32); } +static u32 nfsd4_get_dir_delegation_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* gddr_status */ + + op_encode_verifier_maxsz + + op_encode_stateid_maxsz + + 2 /* gddr_notification */ + + 2 /* gddr_child_attributes */ + + 2 /* gddr_dir_attributes */); +} + #ifdef CONFIG_NFSD_PNFS static u32 nfsd4_getdeviceinfo_rsize(const struct svc_rqst *rqstp, const struct nfsd4_op *op) @@ -3470,6 +3517,12 @@ static const struct nfsd4_operation nfsd4_ops[] = { .op_get_currentstateid = nfsd4_get_freestateid, .op_rsize_bop = nfsd4_only_status_rsize, }, + [OP_GET_DIR_DELEGATION] = { + .op_func = nfsd4_get_dir_delegation, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_GET_DIR_DELEGATION", + .op_rsize_bop = nfsd4_get_dir_delegation_rsize, + }, #ifdef CONFIG_NFSD_PNFS [OP_GETDEVICEINFO] = { .op_func = nfsd4_getdeviceinfo, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 84d4093ca713..a20c2c9d7d45 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -541,7 +541,7 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner) } static struct nfs4_openowner * -find_openstateowner_str_locked(unsigned int hashval, struct nfsd4_open *open, +find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, struct nfs4_client *clp) { struct nfs4_stateowner *so; @@ -558,18 +558,6 @@ find_openstateowner_str_locked(unsigned int hashval, struct nfsd4_open *open, return NULL; } -static struct nfs4_openowner * -find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, - struct nfs4_client *clp) -{ - struct nfs4_openowner *oo; - - spin_lock(&clp->cl_lock); - oo = 
find_openstateowner_str_locked(hashval, open, clp); - spin_unlock(&clp->cl_lock); - return oo; -} - static inline u32 opaque_hashval(const void *ptr, int nbytes) { @@ -1409,11 +1397,16 @@ static void recalculate_deny_mode(struct nfs4_file *fp) { struct nfs4_ol_stateid *stp; + u32 old_deny; spin_lock(&fp->fi_lock); + old_deny = fp->fi_share_deny; fp->fi_share_deny = 0; - list_for_each_entry(stp, &fp->fi_stateids, st_perfile) + list_for_each_entry(stp, &fp->fi_stateids, st_perfile) { fp->fi_share_deny |= bmap_to_share_mode(stp->st_deny_bmap); + if (fp->fi_share_deny == old_deny) + break; + } spin_unlock(&fp->fi_lock); } @@ -2245,6 +2238,10 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, GFP_KERNEL); if (!clp->cl_ownerstr_hashtbl) goto err_no_hashtbl; + clp->cl_callback_wq = alloc_ordered_workqueue("nfsd4_callbacks", 0); + if (!clp->cl_callback_wq) + goto err_no_callback_wq; + for (i = 0; i < OWNER_HASH_SIZE; i++) INIT_LIST_HEAD(&clp->cl_ownerstr_hashtbl[i]); INIT_LIST_HEAD(&clp->cl_sessions); @@ -2267,6 +2264,8 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, spin_lock_init(&clp->cl_lock); rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); return clp; +err_no_callback_wq: + kfree(clp->cl_ownerstr_hashtbl); err_no_hashtbl: kfree(clp->cl_name.data); err_no_name: @@ -2280,6 +2279,7 @@ static void __free_client(struct kref *k) struct nfs4_client *clp = container_of(c, struct nfs4_client, cl_nfsdfs); free_svc_cred(&clp->cl_cred); + destroy_workqueue(clp->cl_callback_wq); kfree(clp->cl_ownerstr_hashtbl); kfree(clp->cl_name.data); kfree(clp->cl_nii_domain.data); @@ -2352,7 +2352,11 @@ unhash_client(struct nfs4_client *clp) static __be32 mark_client_expired_locked(struct nfs4_client *clp) { - if (atomic_read(&clp->cl_rpc_users)) + int users = atomic_read(&clp->cl_rpc_users); + + trace_nfsd_mark_client_expired(clp, users); + + if (users) return nfserr_jukebox; unhash_client_locked(clp); return nfs_ok; @@ -3641,12 +3645,8 @@ 
out_nolock: return status; } -static __be32 -check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse) +static __be32 check_slot_seqid(u32 seqid, u32 slot_seqid, bool slot_inuse) { - dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid, - slot_seqid); - /* The slot is in use, and no response has been sent. */ if (slot_inuse) { if (seqid == slot_seqid) @@ -3823,10 +3823,13 @@ nfsd4_create_session(struct svc_rqst *rqstp, } /* RFC 8881 Section 18.36.4 Phase 2: Sequence ID processing. */ - if (conf) + if (conf) { cs_slot = &conf->cl_cs_slot; - else + trace_nfsd_slot_seqid_conf(conf, cr_ses); + } else { cs_slot = &unconf->cl_cs_slot; + trace_nfsd_slot_seqid_unconf(unconf, cr_ses); + } status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); switch (status) { case nfs_ok: @@ -4221,6 +4224,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * sr_highest_slotid and the sr_target_slot id to maxslots */ seq->maxslots = session->se_fchannel.maxreqs; + trace_nfsd_slot_seqid_sequence(clp, seq, slot); status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_flags & NFSD4_SLOT_INUSE); if (status == nfserr_replay_cache) { @@ -4662,21 +4666,32 @@ nfsd4_init_leases_net(struct nfsd_net *nn) atomic_set(&nn->nfsd_courtesy_clients, 0); } +enum rp_lock { + RP_UNLOCKED, + RP_LOCKED, + RP_UNHASHED, +}; + static void init_nfs4_replay(struct nfs4_replay *rp) { rp->rp_status = nfserr_serverfault; rp->rp_buflen = 0; rp->rp_buf = rp->rp_ibuf; - mutex_init(&rp->rp_mutex); + atomic_set(&rp->rp_locked, RP_UNLOCKED); } -static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate, - struct nfs4_stateowner *so) +static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate, + struct nfs4_stateowner *so) { if (!nfsd4_has_session(cstate)) { - mutex_lock(&so->so_replay.rp_mutex); + wait_var_event(&so->so_replay.rp_locked, + atomic_cmpxchg(&so->so_replay.rp_locked, + RP_UNLOCKED, RP_LOCKED) != RP_LOCKED); + if 
(atomic_read(&so->so_replay.rp_locked) == RP_UNHASHED) + return -EAGAIN; cstate->replay_owner = nfs4_get_stateowner(so); } + return 0; } void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate) @@ -4685,7 +4700,8 @@ void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate) if (so != NULL) { cstate->replay_owner = NULL; - mutex_unlock(&so->so_replay.rp_mutex); + atomic_set(&so->so_replay.rp_locked, RP_UNLOCKED); + wake_up_var(&so->so_replay.rp_locked); nfs4_put_stateowner(so); } } @@ -4866,34 +4882,46 @@ nfsd4_find_and_lock_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) } static struct nfs4_openowner * -alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, - struct nfsd4_compound_state *cstate) +find_or_alloc_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, + struct nfsd4_compound_state *cstate) { struct nfs4_client *clp = cstate->clp; - struct nfs4_openowner *oo, *ret; + struct nfs4_openowner *oo, *new = NULL; - oo = alloc_stateowner(openowner_slab, &open->op_owner, clp); - if (!oo) - return NULL; - oo->oo_owner.so_ops = &openowner_ops; - oo->oo_owner.so_is_open_owner = 1; - oo->oo_owner.so_seqid = open->op_seqid; - oo->oo_flags = 0; - if (nfsd4_has_session(cstate)) - oo->oo_flags |= NFS4_OO_CONFIRMED; - oo->oo_time = 0; - oo->oo_last_closed_stid = NULL; - INIT_LIST_HEAD(&oo->oo_close_lru); +retry: spin_lock(&clp->cl_lock); - ret = find_openstateowner_str_locked(strhashval, open, clp); - if (ret == NULL) { - hash_openowner(oo, clp, strhashval); - ret = oo; - } else - nfs4_free_stateowner(&oo->oo_owner); - + oo = find_openstateowner_str(strhashval, open, clp); + if (!oo && new) { + hash_openowner(new, clp, strhashval); + spin_unlock(&clp->cl_lock); + return new; + } spin_unlock(&clp->cl_lock); - return ret; + + if (oo && !(oo->oo_flags & NFS4_OO_CONFIRMED)) { + /* Replace unconfirmed owners without checking for replay. 
*/ + release_openowner(oo); + oo = NULL; + } + if (oo) { + if (new) + nfs4_free_stateowner(&new->oo_owner); + return oo; + } + + new = alloc_stateowner(openowner_slab, &open->op_owner, clp); + if (!new) + return NULL; + new->oo_owner.so_ops = &openowner_ops; + new->oo_owner.so_is_open_owner = 1; + new->oo_owner.so_seqid = open->op_seqid; + new->oo_flags = 0; + if (nfsd4_has_session(cstate)) + new->oo_flags |= NFS4_OO_CONFIRMED; + new->oo_time = 0; + new->oo_last_closed_stid = NULL; + INIT_LIST_HEAD(&new->oo_close_lru); + goto retry; } static struct nfs4_ol_stateid * @@ -4969,7 +4997,11 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) * Wait for the refcount to drop to 2. Since it has been unhashed, * there should be no danger of the refcount going back up again at * this point. + * Some threads with a reference might be waiting for rp_locked, + * so tell them to stop waiting. */ + atomic_set(&oo->oo_owner.so_replay.rp_locked, RP_UNHASHED); + wake_up_var(&oo->oo_owner.so_replay.rp_locked); wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2); release_all_access(s); @@ -5342,27 +5374,19 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, clp = cstate->clp; strhashval = ownerstr_hashval(&open->op_owner); - oo = find_openstateowner_str(strhashval, open, clp); +retry: + oo = find_or_alloc_open_stateowner(strhashval, open, cstate); open->op_openowner = oo; - if (!oo) { - goto new_owner; - } - if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { - /* Replace unconfirmed owners without checking for replay. 
*/ - release_openowner(oo); - open->op_openowner = NULL; - goto new_owner; + if (!oo) + return nfserr_jukebox; + if (nfsd4_cstate_assign_replay(cstate, &oo->oo_owner) == -EAGAIN) { + nfs4_put_stateowner(&oo->oo_owner); + goto retry; } status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); if (status) return status; - goto alloc_stateid; -new_owner: - oo = alloc_init_open_stateowner(strhashval, open, cstate); - if (oo == NULL) - return nfserr_jukebox; - open->op_openowner = oo; -alloc_stateid: + open->op_stp = nfs4_alloc_open_stateid(clp); if (!open->op_stp) return nfserr_jukebox; @@ -6133,12 +6157,8 @@ out: void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { - if (open->op_openowner) { - struct nfs4_stateowner *so = &open->op_openowner->oo_owner; - - nfsd4_cstate_assign_replay(cstate, so); - nfs4_put_stateowner(so); - } + if (open->op_openowner) + nfs4_put_stateowner(&open->op_openowner->oo_owner); if (open->op_file) kmem_cache_free(file_slab, open->op_file); if (open->op_stp) @@ -7202,12 +7222,16 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, trace_nfsd_preprocess(seqid, stateid); *stpp = NULL; +retry: status = nfsd4_lookup_stateid(cstate, stateid, typemask, statusmask, &s, nn); if (status) return status; stp = openlockstateid(s); - nfsd4_cstate_assign_replay(cstate, stp->st_stateowner); + if (nfsd4_cstate_assign_replay(cstate, stp->st_stateowner) == -EAGAIN) { + nfs4_put_stateowner(stp->st_stateowner); + goto retry; + } status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp); if (!status) @@ -7349,7 +7373,7 @@ out: return status; } -static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) +static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) { struct nfs4_client *clp = s->st_stid.sc_client; bool unhashed; @@ -7366,11 +7390,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) list_for_each_entry(stp, &reaplist, st_locks) 
nfs4_free_cpntf_statelist(clp->net, &stp->st_stid); free_ol_stateid_reaplist(&reaplist); + return false; } else { spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); - if (unhashed) - move_to_close_lru(s, clp->net); + return unhashed; } } @@ -7386,6 +7410,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *stp; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); + bool need_move_to_close_list; dprintk("NFSD: nfsd4_close on file %pd\n", cstate->current_fh.fh_dentry); @@ -7410,8 +7435,10 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, */ nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); - nfsd4_close_open_stateid(stp); + need_move_to_close_list = nfsd4_close_open_stateid(stp); mutex_unlock(&stp->st_mutex); + if (need_move_to_close_list) + move_to_close_lru(stp, net); /* v4.1+ suggests that we send a special stateid in here, since the * clients should just ignore this anyway. 
Since this is not useful @@ -8625,12 +8652,6 @@ nfs4_state_start(void) if (ret) return ret; - ret = nfsd4_create_callback_queue(); - if (ret) { - rhltable_destroy(&nfs4_file_rhltable); - return ret; - } - set_max_delegations(); return 0; } @@ -8671,7 +8692,6 @@ nfs4_state_shutdown_net(struct net *net) void nfs4_state_shutdown(void) { - nfsd4_destroy_callback_queue(); rhltable_destroy(&nfs4_file_rhltable); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index a644460f3a5e..c7bfd2180e3f 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1732,6 +1732,35 @@ nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp, return nfsd4_decode_stateid4(argp, &free_stateid->fr_stateid); } +static __be32 +nfsd4_decode_get_dir_delegation(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) +{ + struct nfsd4_get_dir_delegation *gdd = &u->get_dir_delegation; + __be32 status; + + memset(gdd, 0, sizeof(*gdd)); + + if (xdr_stream_decode_bool(argp->xdr, &gdd->gdda_signal_deleg_avail) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_bitmap4(argp, gdd->gdda_notification_types, + ARRAY_SIZE(gdd->gdda_notification_types)); + if (status) + return status; + status = nfsd4_decode_nfstime4(argp, &gdd->gdda_child_attr_delay); + if (status) + return status; + status = nfsd4_decode_nfstime4(argp, &gdd->gdda_dir_attr_delay); + if (status) + return status; + status = nfsd4_decode_bitmap4(argp, gdd->gdda_child_attributes, + ARRAY_SIZE(gdd->gdda_child_attributes)); + if (status) + return status; + return nfsd4_decode_bitmap4(argp, gdd->gdda_dir_attributes, + ARRAY_SIZE(gdd->gdda_dir_attributes)); +} + #ifdef CONFIG_NFSD_PNFS static __be32 nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, @@ -2370,7 +2399,7 @@ static const nfsd4_dec nfsd4_dec_ops[] = { [OP_CREATE_SESSION] = nfsd4_decode_create_session, [OP_DESTROY_SESSION] = nfsd4_decode_destroy_session, [OP_FREE_STATEID] = nfsd4_decode_free_stateid, - [OP_GET_DIR_DELEGATION] = nfsd4_decode_notsupp, + 
[OP_GET_DIR_DELEGATION] = nfsd4_decode_get_dir_delegation, #ifdef CONFIG_NFSD_PNFS [OP_GETDEVICEINFO] = nfsd4_decode_getdeviceinfo, [OP_GETDEVICELIST] = nfsd4_decode_notsupp, @@ -4963,6 +4992,49 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, return nfs_ok; } +static __be32 +nfsd4_encode_get_dir_delegation(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_get_dir_delegation *gdd = &u->get_dir_delegation; + struct xdr_stream *xdr = resp->xdr; + __be32 status = nfserr_resource; + + switch(gdd->gddrnf_status) { + case GDD4_OK: + if (xdr_stream_encode_u32(xdr, GDD4_OK) != XDR_UNIT) + break; + status = nfsd4_encode_verifier4(xdr, &gdd->gddr_cookieverf); + if (status) + break; + status = nfsd4_encode_stateid4(xdr, &gdd->gddr_stateid); + if (status) + break; + status = nfsd4_encode_bitmap4(xdr, gdd->gddr_notification[0], 0, 0); + if (status) + break; + status = nfsd4_encode_bitmap4(xdr, gdd->gddr_child_attributes[0], + gdd->gddr_child_attributes[1], + gdd->gddr_child_attributes[2]); + if (status) + break; + status = nfsd4_encode_bitmap4(xdr, gdd->gddr_dir_attributes[0], + gdd->gddr_dir_attributes[1], + gdd->gddr_dir_attributes[2]); + break; + default: + pr_warn("nfsd: bad gddrnf_status (%u)\n", gdd->gddrnf_status); + gdd->gddrnf_will_signal_deleg_avail = 0; + fallthrough; + case GDD4_UNAVAIL: + if (xdr_stream_encode_u32(xdr, GDD4_UNAVAIL) != XDR_UNIT) + break; + status = nfsd4_encode_bool(xdr, gdd->gddrnf_will_signal_deleg_avail); + break; + } + return status; +} + #ifdef CONFIG_NFSD_PNFS static __be32 nfsd4_encode_device_addr4(struct xdr_stream *xdr, @@ -5199,7 +5271,12 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, if (nfserr != nfs_ok) return nfserr; /* osr_complete<1> */ - if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + if (os->completed) { + if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT) + return nfserr_resource; + if (xdr_stream_encode_be32(xdr, os->status) != XDR_UNIT) 
+ return nfserr_resource; + } else if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) return nfserr_resource; return nfs_ok; } @@ -5579,7 +5656,7 @@ static const nfsd4_enc nfsd4_enc_ops[] = { [OP_CREATE_SESSION] = nfsd4_encode_create_session, [OP_DESTROY_SESSION] = nfsd4_encode_noop, [OP_FREE_STATEID] = nfsd4_encode_noop, - [OP_GET_DIR_DELEGATION] = nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = nfsd4_encode_get_dir_delegation, #ifdef CONFIG_NFSD_PNFS [OP_GETDEVICEINFO] = nfsd4_encode_getdeviceinfo, [OP_GETDEVICELIST] = nfsd4_encode_noop, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index ecd18bffeebc..202140df8f82 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -15,6 +15,7 @@ #include <linux/sunrpc/addr.h> #include <linux/sunrpc/gss_api.h> #include <linux/sunrpc/rpc_pipe_fs.h> +#include <linux/sunrpc/svc.h> #include <linux/module.h> #include <linux/fsnotify.h> @@ -48,12 +49,10 @@ enum { NFSD_MaxBlkSize, NFSD_MaxConnections, NFSD_Filecache, -#ifdef CONFIG_NFSD_V4 NFSD_Leasetime, NFSD_Gracetime, NFSD_RecoveryDir, NFSD_V4EndGrace, -#endif NFSD_MaxReserved }; @@ -406,7 +405,9 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) if (newthreads < 0) return -EINVAL; trace_nfsd_ctl_threads(net, newthreads); - rv = nfsd_svc(newthreads, net, file->f_cred); + mutex_lock(&nfsd_mutex); + rv = nfsd_svc(newthreads, net, file->f_cred, NULL); + mutex_unlock(&nfsd_mutex); if (rv < 0) return rv; } else @@ -1360,7 +1361,9 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, +#endif [NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO}, #endif /* last one */ {""} @@ -1652,6 +1655,518 @@ int nfsd_nl_rpc_status_get_done(struct 
netlink_callback *cb) } /** + * nfsd_nl_threads_set_doit - set the number of running threads + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * Return 0 on success or a negative errno. + */ +int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + int nthreads = 0, count = 0, nrpools, ret = -EOPNOTSUPP, rem; + struct net *net = genl_info_net(info); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + const struct nlattr *attr; + const char *scope = NULL; + + if (GENL_REQ_ATTR_CHECK(info, NFSD_A_SERVER_THREADS)) + return -EINVAL; + + /* count number of SERVER_THREADS values */ + nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { + if (nla_type(attr) == NFSD_A_SERVER_THREADS) + count++; + } + + mutex_lock(&nfsd_mutex); + + nrpools = nfsd_nrpools(net); + if (nrpools && count > nrpools) + count = nrpools; + + /* XXX: make this handle non-global pool-modes */ + if (count > 1) + goto out_unlock; + + nthreads = nla_get_u32(info->attrs[NFSD_A_SERVER_THREADS]); + if (info->attrs[NFSD_A_SERVER_GRACETIME] || + info->attrs[NFSD_A_SERVER_LEASETIME] || + info->attrs[NFSD_A_SERVER_SCOPE]) { + ret = -EBUSY; + if (nn->nfsd_serv && nn->nfsd_serv->sv_nrthreads) + goto out_unlock; + + ret = -EINVAL; + attr = info->attrs[NFSD_A_SERVER_GRACETIME]; + if (attr) { + u32 gracetime = nla_get_u32(attr); + + if (gracetime < 10 || gracetime > 3600) + goto out_unlock; + + nn->nfsd4_grace = gracetime; + } + + attr = info->attrs[NFSD_A_SERVER_LEASETIME]; + if (attr) { + u32 leasetime = nla_get_u32(attr); + + if (leasetime < 10 || leasetime > 3600) + goto out_unlock; + + nn->nfsd4_lease = leasetime; + } + + attr = info->attrs[NFSD_A_SERVER_SCOPE]; + if (attr) + scope = nla_data(attr); + } + + ret = nfsd_svc(nthreads, net, get_current_cred(), scope); + +out_unlock: + mutex_unlock(&nfsd_mutex); + + return ret == nthreads ? 
0 : ret; +} + +/** + * nfsd_nl_threads_get_doit - get the number of running threads + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * Return 0 on success or a negative errno. + */ +int nfsd_nl_threads_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + void *hdr; + int err; + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_iput(skb, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_free_msg; + } + + mutex_lock(&nfsd_mutex); + + err = nla_put_u32(skb, NFSD_A_SERVER_GRACETIME, + nn->nfsd4_grace) || + nla_put_u32(skb, NFSD_A_SERVER_LEASETIME, + nn->nfsd4_lease) || + nla_put_string(skb, NFSD_A_SERVER_SCOPE, + nn->nfsd_name); + if (err) + goto err_unlock; + + if (nn->nfsd_serv) { + int i; + + for (i = 0; i < nfsd_nrpools(net); ++i) { + struct svc_pool *sp = &nn->nfsd_serv->sv_pools[i]; + + err = nla_put_u32(skb, NFSD_A_SERVER_THREADS, + atomic_read(&sp->sp_nrthreads)); + if (err) + goto err_unlock; + } + } else { + err = nla_put_u32(skb, NFSD_A_SERVER_THREADS, 0); + if (err) + goto err_unlock; + } + + mutex_unlock(&nfsd_mutex); + + genlmsg_end(skb, hdr); + + return genlmsg_reply(skb, info); + +err_unlock: + mutex_unlock(&nfsd_mutex); +err_free_msg: + nlmsg_free(skb); + + return err; +} + +/** + * nfsd_nl_version_set_doit - set the nfs enabled versions + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * Return 0 on success or a negative errno. 
+ */ +int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + const struct nlattr *attr; + struct nfsd_net *nn; + int i, rem; + + if (GENL_REQ_ATTR_CHECK(info, NFSD_A_SERVER_PROTO_VERSION)) + return -EINVAL; + + mutex_lock(&nfsd_mutex); + + nn = net_generic(genl_info_net(info), nfsd_net_id); + if (nn->nfsd_serv) { + mutex_unlock(&nfsd_mutex); + return -EBUSY; + } + + /* clear current supported versions. */ + nfsd_vers(nn, 2, NFSD_CLEAR); + nfsd_vers(nn, 3, NFSD_CLEAR); + for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) + nfsd_minorversion(nn, i, NFSD_CLEAR); + + nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { + struct nlattr *tb[NFSD_A_VERSION_MAX + 1]; + u32 major, minor = 0; + bool enabled; + + if (nla_type(attr) != NFSD_A_SERVER_PROTO_VERSION) + continue; + + if (nla_parse_nested(tb, NFSD_A_VERSION_MAX, attr, + nfsd_version_nl_policy, info->extack) < 0) + continue; + + if (!tb[NFSD_A_VERSION_MAJOR]) + continue; + + major = nla_get_u32(tb[NFSD_A_VERSION_MAJOR]); + if (tb[NFSD_A_VERSION_MINOR]) + minor = nla_get_u32(tb[NFSD_A_VERSION_MINOR]); + + enabled = nla_get_flag(tb[NFSD_A_VERSION_ENABLED]); + + switch (major) { + case 4: + nfsd_minorversion(nn, minor, enabled ? NFSD_SET : NFSD_CLEAR); + break; + case 3: + case 2: + if (!minor) + nfsd_vers(nn, major, enabled ? NFSD_SET : NFSD_CLEAR); + break; + default: + break; + } + } + + mutex_unlock(&nfsd_mutex); + + return 0; +} + +/** + * nfsd_nl_version_get_doit - get the enabled status for all supported nfs versions + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * Return 0 on success or a negative errno. 
+ */ +int nfsd_nl_version_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct nfsd_net *nn; + int i, err; + void *hdr; + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_iput(skb, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_free_msg; + } + + mutex_lock(&nfsd_mutex); + nn = net_generic(genl_info_net(info), nfsd_net_id); + + for (i = 2; i <= 4; i++) { + int j; + + for (j = 0; j <= NFSD_SUPPORTED_MINOR_VERSION; j++) { + struct nlattr *attr; + + /* Don't record any versions the kernel doesn't have + * compiled in + */ + if (!nfsd_support_version(i)) + continue; + + /* NFSv{2,3} does not support minor numbers */ + if (i < 4 && j) + continue; + + attr = nla_nest_start(skb, + NFSD_A_SERVER_PROTO_VERSION); + if (!attr) { + err = -EINVAL; + goto err_nfsd_unlock; + } + + if (nla_put_u32(skb, NFSD_A_VERSION_MAJOR, i) || + nla_put_u32(skb, NFSD_A_VERSION_MINOR, j)) { + err = -EINVAL; + goto err_nfsd_unlock; + } + + /* Set the enabled flag if the version is enabled */ + if (nfsd_vers(nn, i, NFSD_TEST) && + (i < 4 || nfsd_minorversion(nn, j, NFSD_TEST)) && + nla_put_flag(skb, NFSD_A_VERSION_ENABLED)) { + err = -EINVAL; + goto err_nfsd_unlock; + } + + nla_nest_end(skb, attr); + } + } + + mutex_unlock(&nfsd_mutex); + genlmsg_end(skb, hdr); + + return genlmsg_reply(skb, info); + +err_nfsd_unlock: + mutex_unlock(&nfsd_mutex); +err_free_msg: + nlmsg_free(skb); + + return err; +} + +/** + * nfsd_nl_listener_set_doit - set the nfs running sockets + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * Return 0 on success or a negative errno. 
+ */ +int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct svc_xprt *xprt, *tmp; + const struct nlattr *attr; + struct svc_serv *serv; + LIST_HEAD(permsocks); + struct nfsd_net *nn; + int err, rem; + + mutex_lock(&nfsd_mutex); + + err = nfsd_create_serv(net); + if (err) { + mutex_unlock(&nfsd_mutex); + return err; + } + + nn = net_generic(net, nfsd_net_id); + serv = nn->nfsd_serv; + + spin_lock_bh(&serv->sv_lock); + + /* Move all of the old listener sockets to a temp list */ + list_splice_init(&serv->sv_permsocks, &permsocks); + + /* + * Walk the list of server_socks from userland and move any that match + * back to sv_permsocks + */ + nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { + struct nlattr *tb[NFSD_A_SOCK_MAX + 1]; + const char *xcl_name; + struct sockaddr *sa; + + if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR) + continue; + + if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr, + nfsd_sock_nl_policy, info->extack) < 0) + continue; + + if (!tb[NFSD_A_SOCK_ADDR] || !tb[NFSD_A_SOCK_TRANSPORT_NAME]) + continue; + + if (nla_len(tb[NFSD_A_SOCK_ADDR]) < sizeof(*sa)) + continue; + + xcl_name = nla_data(tb[NFSD_A_SOCK_TRANSPORT_NAME]); + sa = nla_data(tb[NFSD_A_SOCK_ADDR]); + + /* Put back any matching sockets */ + list_for_each_entry_safe(xprt, tmp, &permsocks, xpt_list) { + /* This shouldn't be possible */ + if (WARN_ON_ONCE(xprt->xpt_net != net)) { + list_move(&xprt->xpt_list, &serv->sv_permsocks); + continue; + } + + /* If everything matches, put it back */ + if (!strcmp(xprt->xpt_class->xcl_name, xcl_name) && + rpc_cmp_addr_port(sa, (struct sockaddr *)&xprt->xpt_local)) { + list_move(&xprt->xpt_list, &serv->sv_permsocks); + break; + } + } + } + + /* For now, no removing old sockets while server is running */ + if (serv->sv_nrthreads && !list_empty(&permsocks)) { + list_splice_init(&permsocks, &serv->sv_permsocks); + spin_unlock_bh(&serv->sv_lock); + err = -EBUSY; + goto 
out_unlock_mtx; + } + + /* Close the remaining sockets on the permsocks list */ + while (!list_empty(&permsocks)) { + xprt = list_first_entry(&permsocks, struct svc_xprt, xpt_list); + list_move(&xprt->xpt_list, &serv->sv_permsocks); + + /* + * Newly-created sockets are born with the BUSY bit set. Clear + * it if there are no threads, since nothing can pick it up + * in that case. + */ + if (!serv->sv_nrthreads) + clear_bit(XPT_BUSY, &xprt->xpt_flags); + + set_bit(XPT_CLOSE, &xprt->xpt_flags); + spin_unlock_bh(&serv->sv_lock); + svc_xprt_close(xprt); + spin_lock_bh(&serv->sv_lock); + } + + spin_unlock_bh(&serv->sv_lock); + + /* walk list of addrs again, open any that still don't exist */ + nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { + struct nlattr *tb[NFSD_A_SOCK_MAX + 1]; + const char *xcl_name; + struct sockaddr *sa; + int ret; + + if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR) + continue; + + if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr, + nfsd_sock_nl_policy, info->extack) < 0) + continue; + + if (!tb[NFSD_A_SOCK_ADDR] || !tb[NFSD_A_SOCK_TRANSPORT_NAME]) + continue; + + if (nla_len(tb[NFSD_A_SOCK_ADDR]) < sizeof(*sa)) + continue; + + xcl_name = nla_data(tb[NFSD_A_SOCK_TRANSPORT_NAME]); + sa = nla_data(tb[NFSD_A_SOCK_ADDR]); + + xprt = svc_find_listener(serv, xcl_name, net, sa); + if (xprt) { + svc_xprt_put(xprt); + continue; + } + + ret = svc_xprt_create_from_sa(serv, xcl_name, net, sa, + SVC_SOCK_ANONYMOUS, + get_current_cred()); + /* always save the latest error */ + if (ret < 0) + err = ret; + } + + if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks)) + nfsd_destroy_serv(net); + +out_unlock_mtx: + mutex_unlock(&nfsd_mutex); + + return err; +} + +/** + * nfsd_nl_listener_get_doit - get the nfs running listeners + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * Return 0 on success or a negative errno. 
+ */ +int nfsd_nl_listener_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct svc_xprt *xprt; + struct svc_serv *serv; + struct nfsd_net *nn; + void *hdr; + int err; + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_iput(skb, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_free_msg; + } + + mutex_lock(&nfsd_mutex); + nn = net_generic(genl_info_net(info), nfsd_net_id); + + /* no nfs server? Just send empty socket list */ + if (!nn->nfsd_serv) + goto out_unlock_mtx; + + serv = nn->nfsd_serv; + spin_lock_bh(&serv->sv_lock); + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) { + struct nlattr *attr; + + attr = nla_nest_start(skb, NFSD_A_SERVER_SOCK_ADDR); + if (!attr) { + err = -EINVAL; + goto err_serv_unlock; + } + + if (nla_put_string(skb, NFSD_A_SOCK_TRANSPORT_NAME, + xprt->xpt_class->xcl_name) || + nla_put(skb, NFSD_A_SOCK_ADDR, + sizeof(struct sockaddr_storage), + &xprt->xpt_local)) { + err = -EINVAL; + goto err_serv_unlock; + } + + nla_nest_end(skb, attr); + } + spin_unlock_bh(&serv->sv_lock); +out_unlock_mtx: + mutex_unlock(&nfsd_mutex); + genlmsg_end(skb, hdr); + + return genlmsg_reply(skb, info); + +err_serv_unlock: + spin_unlock_bh(&serv->sv_lock); + mutex_unlock(&nfsd_mutex); +err_free_msg: + nlmsg_free(skb); + + return err; +} + +/** * nfsd_net_init - Prepare the nfsd_net portion of a new net namespace * @net: a freshly-created network namespace * @@ -1672,7 +2187,8 @@ static __net_init int nfsd_net_init(struct net *net) retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; - retval = nfsd_stat_counters_init(nn); + retval = percpu_counter_init_many(nn->counter, 0, GFP_KERNEL, + NFSD_STATS_COUNTERS_NUM); if (retval) goto out_repcache_error; memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats)); @@ -1704,7 +2220,7 @@ static __net_exit void nfsd_net_exit(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfsd_proc_stat_shutdown(net); - 
nfsd_stat_counters_destroy(nn); + percpu_counter_destroy_many(nn->counter, NFSD_STATS_COUNTERS_NUM); nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); nfsd_netns_free_versions(nn); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 16c5a05f340e..8f4f239d9f8a 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -103,7 +103,7 @@ bool nfssvc_encode_voidres(struct svc_rqst *rqstp, /* * Function prototypes. */ -int nfsd_svc(int nrservs, struct net *net, const struct cred *cred); +int nfsd_svc(int nrservs, struct net *net, const struct cred *cred, const char *scope); int nfsd_dispatch(struct svc_rqst *rqstp); int nfsd_nrthreads(struct net *); @@ -230,7 +230,6 @@ void nfsd_lockd_shutdown(void); #define nfserr_nospc cpu_to_be32(NFSERR_NOSPC) #define nfserr_rofs cpu_to_be32(NFSERR_ROFS) #define nfserr_mlink cpu_to_be32(NFSERR_MLINK) -#define nfserr_opnotsupp cpu_to_be32(NFSERR_OPNOTSUPP) #define nfserr_nametoolong cpu_to_be32(NFSERR_NAMETOOLONG) #define nfserr_notempty cpu_to_be32(NFSERR_NOTEMPTY) #define nfserr_dquot cpu_to_be32(NFSERR_DQUOT) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 40fecf7b224f..0b75305fb5f5 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -573,7 +573,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, _fh_update(fhp, exp, dentry); if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) { fh_put(fhp); - return nfserr_opnotsupp; + return nfserr_stale; } return 0; @@ -599,7 +599,7 @@ fh_update(struct svc_fh *fhp) _fh_update(fhp, fhp->fh_export, dentry); if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) - return nfserr_opnotsupp; + return nfserr_stale; return 0; out_bad: printk(KERN_ERR "fh_update: fh not verified!\n"); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index c0d17b92b249..cd9a6a1a9fc8 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -133,8 +133,7 @@ struct svc_program nfsd_program = { .pg_rpcbind_set = nfsd_rpcbind_set, }; -static bool -nfsd_support_version(int vers) +bool 
nfsd_support_version(int vers) { if (vers >= NFSD_MINVERS && vers < NFSD_NRVERS) return nfsd_version[vers] != NULL; @@ -769,13 +768,14 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) * this is the first time nrservs is nonzero. */ int -nfsd_svc(int nrservs, struct net *net, const struct cred *cred) +nfsd_svc(int nrservs, struct net *net, const struct cred *cred, const char *scope) { int error; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; - mutex_lock(&nfsd_mutex); + lockdep_assert_held(&nfsd_mutex); + dprintk("nfsd: creating service\n"); nrservs = max(nrservs, 0); @@ -785,7 +785,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) if (nrservs == 0 && nn->nfsd_serv == NULL) goto out; - strscpy(nn->nfsd_name, utsname()->nodename, + strscpy(nn->nfsd_name, scope ? scope : utsname()->nodename, sizeof(nn->nfsd_name)); error = nfsd_create_serv(net); @@ -804,7 +804,6 @@ out_put: if (serv->sv_nrthreads == 0) nfsd_destroy_serv(net); out: - mutex_unlock(&nfsd_mutex); return error; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 2ed0fcf879fd..ffc217099d19 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -408,6 +408,8 @@ struct nfs4_client { 1 << NFSD4_CLIENT_CB_KILL) #define NFSD4_CLIENT_CB_RECALL_ANY (6) unsigned long cl_flags; + + struct workqueue_struct *cl_callback_wq; const struct cred *cl_cb_cred; struct rpc_clnt *cl_cb_client; u32 cl_cb_ident; @@ -486,7 +488,7 @@ struct nfs4_replay { unsigned int rp_buflen; char *rp_buf; struct knfsd_fh rp_openfh; - struct mutex rp_mutex; + atomic_t rp_locked; char rp_ibuf[NFSD4_REPLAY_ISIZE]; }; @@ -735,8 +737,6 @@ extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn * extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); extern bool nfsd4_run_cb(struct nfsd4_callback *cb); -extern int nfsd4_create_callback_queue(void); -extern void 
nfsd4_destroy_callback_queue(void); extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfsd4_shutdown_copy(struct nfs4_client *clp); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name, diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index be52fb1e928e..bb22893f1157 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -73,48 +73,6 @@ static int nfsd_show(struct seq_file *seq, void *v) DEFINE_PROC_SHOW_ATTRIBUTE(nfsd); -int nfsd_percpu_counters_init(struct percpu_counter *counters, int num) -{ - int i, err = 0; - - for (i = 0; !err && i < num; i++) - err = percpu_counter_init(&counters[i], 0, GFP_KERNEL); - - if (!err) - return 0; - - for (; i > 0; i--) - percpu_counter_destroy(&counters[i-1]); - - return err; -} - -void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num) -{ - int i; - - for (i = 0; i < num; i++) - percpu_counter_set(&counters[i], 0); -} - -void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num) -{ - int i; - - for (i = 0; i < num; i++) - percpu_counter_destroy(&counters[i]); -} - -int nfsd_stat_counters_init(struct nfsd_net *nn) -{ - return nfsd_percpu_counters_init(nn->counter, NFSD_STATS_COUNTERS_NUM); -} - -void nfsd_stat_counters_destroy(struct nfsd_net *nn) -{ - nfsd_percpu_counters_destroy(nn->counter, NFSD_STATS_COUNTERS_NUM); -} - void nfsd_proc_stat_init(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index d2753e975dfd..04aacb6c36e2 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -10,11 +10,6 @@ #include <uapi/linux/nfsd/stats.h> #include <linux/percpu_counter.h> -int nfsd_percpu_counters_init(struct percpu_counter *counters, int num); -void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num); -void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num); -int nfsd_stat_counters_init(struct nfsd_net *nn); -void nfsd_stat_counters_destroy(struct 
nfsd_net *nn); void nfsd_proc_stat_init(struct net *net); void nfsd_proc_stat_shutdown(struct net *net); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 1cd2076210b1..77bbd23aa150 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -104,7 +104,7 @@ TRACE_EVENT(nfsd_compound, TP_fast_assign( __entry->xid = be32_to_cpu(rqst->rq_xid); __entry->opcnt = opcnt; - __assign_str(tag, tag); + __assign_str(tag); ), TP_printk("xid=0x%08x opcnt=%u tag=%s", __entry->xid, __entry->opcnt, __get_str(tag) @@ -127,7 +127,7 @@ TRACE_EVENT(nfsd_compound_status, __entry->args_opcnt = args_opcnt; __entry->resp_opcnt = resp_opcnt; __entry->status = be32_to_cpu(status); - __assign_str(name, name); + __assign_str(name); ), TP_printk("op=%u/%u %s status=%d", __entry->resp_opcnt, __entry->args_opcnt, @@ -318,7 +318,7 @@ TRACE_EVENT(nfsd_exp_find_key, TP_fast_assign( __entry->fsidtype = key->ek_fsidtype; memcpy(__entry->fsid, key->ek_fsid, 4*6); - __assign_str(auth_domain, key->ek_client->name); + __assign_str(auth_domain); __entry->status = status; ), TP_printk("fsid=%x::%s domain=%s status=%d", @@ -342,8 +342,8 @@ TRACE_EVENT(nfsd_expkey_update, TP_fast_assign( __entry->fsidtype = key->ek_fsidtype; memcpy(__entry->fsid, key->ek_fsid, 4*6); - __assign_str(auth_domain, key->ek_client->name); - __assign_str(path, exp_path); + __assign_str(auth_domain); + __assign_str(path); __entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags); ), TP_printk("fsid=%x::%s domain=%s path=%s cache=%s", @@ -365,8 +365,8 @@ TRACE_EVENT(nfsd_exp_get_by_name, __field(int, status) ), TP_fast_assign( - __assign_str(path, key->ex_path.dentry->d_name.name); - __assign_str(auth_domain, key->ex_client->name); + __assign_str(path); + __assign_str(auth_domain); __entry->status = status; ), TP_printk("path=%s domain=%s status=%d", @@ -385,8 +385,8 @@ TRACE_EVENT(nfsd_export_update, __field(bool, cache) ), TP_fast_assign( - __assign_str(path, key->ex_path.dentry->d_name.name); - __assign_str(auth_domain, 
key->ex_client->name); + __assign_str(path); + __assign_str(auth_domain); __entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags); ), TP_printk("path=%s domain=%s cache=%s", @@ -485,7 +485,7 @@ TRACE_EVENT(nfsd_dirent, TP_fast_assign( __entry->fh_hash = fhp ? knfsd_fh_hash(&fhp->fh_handle) : 0; __entry->ino = ino; - __assign_str(name, name); + __assign_str(name); ), TP_printk("fh_hash=0x%08x ino=%llu name=%s", __entry->fh_hash, __entry->ino, __get_str(name) @@ -749,6 +749,76 @@ TRACE_EVENT_CONDITION(nfsd_seq4_status, ) ); +DECLARE_EVENT_CLASS(nfsd_cs_slot_class, + TP_PROTO( + const struct nfs4_client *clp, + const struct nfsd4_create_session *cs + ), + TP_ARGS(clp, cs), + TP_STRUCT__entry( + __field(u32, seqid) + __field(u32, slot_seqid) + __field(u32, cl_boot) + __field(u32, cl_id) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + const struct nfsd4_clid_slot *slot = &clp->cl_cs_slot; + + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen); + __entry->seqid = cs->seqid; + __entry->slot_seqid = slot->sl_seqid; + ), + TP_printk("addr=%pISpc client %08x:%08x seqid=%u slot_seqid=%u", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->seqid, __entry->slot_seqid + ) +); + +#define DEFINE_CS_SLOT_EVENT(name) \ +DEFINE_EVENT(nfsd_cs_slot_class, nfsd_##name, \ + TP_PROTO( \ + const struct nfs4_client *clp, \ + const struct nfsd4_create_session *cs \ + ), \ + TP_ARGS(clp, cs)) + +DEFINE_CS_SLOT_EVENT(slot_seqid_conf); +DEFINE_CS_SLOT_EVENT(slot_seqid_unconf); + +TRACE_EVENT(nfsd_slot_seqid_sequence, + TP_PROTO( + const struct nfs4_client *clp, + const struct nfsd4_sequence *seq, + const struct nfsd4_slot *slot + ), + TP_ARGS(clp, seq, slot), + TP_STRUCT__entry( + __field(u32, seqid) + __field(u32, slot_seqid) + __field(u32, cl_boot) + __field(u32, cl_id) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) + __field(bool, 
in_use) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen); + __entry->seqid = seq->seqid; + __entry->slot_seqid = slot->sl_seqid; + ), + TP_printk("addr=%pISpc client %08x:%08x seqid=%u slot_seqid=%u (%sin use)", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->seqid, __entry->slot_seqid, + __entry->in_use ? "" : "not " + ) +); + DECLARE_EVENT_CLASS(nfsd_clientid_class, TP_PROTO(const clientid_t *clid), TP_ARGS(clid), @@ -778,6 +848,30 @@ DEFINE_CLIENTID_EVENT(purged); DEFINE_CLIENTID_EVENT(renew); DEFINE_CLIENTID_EVENT(stale); +TRACE_EVENT(nfsd_mark_client_expired, + TP_PROTO( + const struct nfs4_client *clp, + int cl_rpc_users + ), + TP_ARGS(clp, cl_rpc_users), + TP_STRUCT__entry( + __field(int, cl_rpc_users) + __field(u32, cl_boot) + __field(u32, cl_id) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + __entry->cl_rpc_users = cl_rpc_users; + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) + ), + TP_printk("addr=%pISpc client %08x:%08x cl_rpc_users=%d", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->cl_rpc_users) +); + DECLARE_EVENT_CLASS(nfsd_net_class, TP_PROTO(const struct nfsd_net *nn), TP_ARGS(nn), @@ -906,7 +1000,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class, __entry->flavor = clp->cl_cred.cr_flavor; memcpy(__entry->verifier, (void *)&clp->cl_verifier, NFS4_VERIFIER_SIZE); - __assign_str(name, clp->cl_name.data); + __assign_str(name); ), TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x", __entry->addr, __get_str(name), @@ -1425,7 +1519,7 @@ TRACE_EVENT(nfsd_cb_setup, TP_fast_assign( __entry->cl_boot = clp->cl_clientid.cl_boot; __entry->cl_id = clp->cl_clientid.cl_id; - __assign_str(netid, netid); + 
__assign_str(netid); __entry->authflavor = authflavor; __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, clp->cl_cb_conn.cb_addrlen) @@ -1534,7 +1628,7 @@ TRACE_EVENT(nfsd_cb_seq_status, __entry->seq_status = cb->cb_seq_status; ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER - " sessionid=%08x:%08x:%08x:%08x tk_status=%d seq_status=%d\n", + " sessionid=%08x:%08x:%08x:%08x tk_status=%d seq_status=%d", __entry->task_id, __entry->client_id, __entry->cl_boot, __entry->cl_id, __entry->seqno, __entry->reserved, @@ -1573,7 +1667,7 @@ TRACE_EVENT(nfsd_cb_free_slot, __entry->slot_seqno = session->se_cb_seq_nr; ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER - " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u\n", + " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u", __entry->task_id, __entry->client_id, __entry->cl_boot, __entry->cl_id, __entry->seqno, __entry->reserved, @@ -1770,7 +1864,7 @@ TRACE_EVENT(nfsd_ctl_unlock_ip, ), TP_fast_assign( __entry->netns_ino = net->ns.inum; - __assign_str(address, address); + __assign_str(address); ), TP_printk("address=%s", __get_str(address) @@ -1789,7 +1883,7 @@ TRACE_EVENT(nfsd_ctl_unlock_fs, ), TP_fast_assign( __entry->netns_ino = net->ns.inum; - __assign_str(path, path); + __assign_str(path); ), TP_printk("path=%s", __get_str(path) @@ -1813,8 +1907,8 @@ TRACE_EVENT(nfsd_ctl_filehandle, TP_fast_assign( __entry->netns_ino = net->ns.inum; __entry->maxsize = maxsize; - __assign_str(domain, domain); - __assign_str(path, path); + __assign_str(domain); + __assign_str(path); ), TP_printk("domain=%s path=%s maxsize=%d", __get_str(domain), __get_str(path), __entry->maxsize @@ -1874,7 +1968,7 @@ TRACE_EVENT(nfsd_ctl_version, ), TP_fast_assign( __entry->netns_ino = net->ns.inum; - __assign_str(mesg, mesg); + __assign_str(mesg); ), TP_printk("%s", __get_str(mesg) @@ -1915,7 +2009,7 @@ TRACE_EVENT(nfsd_ctl_ports_addxprt, TP_fast_assign( __entry->netns_ino = net->ns.inum; __entry->port = port; - __assign_str(transport, transport); + __assign_str(transport); ), 
TP_printk("transport=%s port=%d", __get_str(transport), __entry->port @@ -1976,9 +2070,9 @@ TRACE_EVENT(nfsd_ctl_time, TP_fast_assign( __entry->netns_ino = net->ns.inum; __entry->time = time; - __assign_str(name, name); + __assign_str(name); ), - TP_printk("file=%s time=%d\n", + TP_printk("file=%s time=%d", __get_str(name), __entry->time ) ); @@ -1995,7 +2089,7 @@ TRACE_EVENT(nfsd_ctl_recoverydir, ), TP_fast_assign( __entry->netns_ino = net->ns.inum; - __assign_str(recdir, recdir); + __assign_str(recdir); ), TP_printk("recdir=%s", __get_str(recdir) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2e41eb4c3cec..29b1f3613800 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1422,7 +1422,7 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, * Callers expect new file metadata to be committed even * if the attributes have not changed. */ - if (iap->ia_valid) + if (nfsd_attrs_valid(attrs)) status = nfsd_setattr(rqstp, resfhp, attrs, NULL); else status = nfserrno(commit_metadata(resfhp)); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index c60fdb6200fd..57cd70062048 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -60,6 +60,14 @@ static inline void nfsd_attrs_free(struct nfsd_attrs *attrs) posix_acl_release(attrs->na_dpacl); } +static inline bool nfsd_attrs_valid(struct nfsd_attrs *attrs) +{ + struct iattr *iap = attrs->na_iattr; + + return (iap->ia_valid || (attrs->na_seclabel && + attrs->na_seclabel->len)); +} + __be32 nfserrno (int errno); int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct svc_export **expp); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 415516c1b27e..fbdd42cde1fa 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -518,6 +518,24 @@ struct nfsd4_free_stateid { stateid_t fr_stateid; /* request */ }; +struct nfsd4_get_dir_delegation { + /* request */ + u32 gdda_signal_deleg_avail; + u32 gdda_notification_types[1]; + struct timespec64 gdda_child_attr_delay; + struct timespec64 gdda_dir_attr_delay; + u32 
gdda_child_attributes[3]; + u32 gdda_dir_attributes[3]; + /* response */ + u32 gddrnf_status; + nfs4_verifier gddr_cookieverf; + stateid_t gddr_stateid; + u32 gddr_notification[1]; + u32 gddr_child_attributes[3]; + u32 gddr_dir_attributes[3]; + bool gddrnf_will_signal_deleg_avail; +}; + /* also used for NVERIFY */ struct nfsd4_verify { u32 ve_bmval[3]; /* request */ @@ -674,8 +692,10 @@ struct nfsd4_copy { #define NFSD4_COPY_F_INTRA (1) #define NFSD4_COPY_F_SYNCHRONOUS (2) #define NFSD4_COPY_F_COMMITTED (3) +#define NFSD4_COPY_F_COMPLETED (4) /* response */ + __be32 nfserr; struct nfsd42_write_res cp_res; struct knfsd_fh fh; @@ -735,7 +755,8 @@ struct nfsd4_offload_status { /* response */ u64 count; - u32 status; + __be32 status; + bool completed; }; struct nfsd4_copy_notify { @@ -797,6 +818,7 @@ struct nfsd4_op { struct nfsd4_reclaim_complete reclaim_complete; struct nfsd4_test_stateid test_stateid; struct nfsd4_free_stateid free_stateid; + struct nfsd4_get_dir_delegation get_dir_delegation; struct nfsd4_getdeviceinfo getdeviceinfo; struct nfsd4_layoutget layoutget; struct nfsd4_layoutcommit layoutcommit; diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 65659fa0372e..a139970e4804 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -1857,13 +1857,22 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree, } /** - * nilfs_btree_convert_and_insert - - * @bmap: - * @key: - * @ptr: - * @keys: - * @ptrs: - * @n: + * nilfs_btree_convert_and_insert - Convert and insert entries into a B-tree + * @btree: NILFS B-tree structure + * @key: Key of the new entry to be inserted + * @ptr: Pointer (block number) associated with the key to be inserted + * @keys: Array of keys to be inserted in addition to @key + * @ptrs: Array of pointers associated with @keys + * @n: Number of keys and pointers in @keys and @ptrs + * + * This function is used to insert a new entry specified by @key and @ptr, + * along with additional entries specified by @keys and @ptrs 
arrays, into a + * NILFS B-tree. + * It prepares the necessary changes by allocating the required blocks and any + * necessary intermediate nodes. It converts configurations from other forms of + * block mapping (the one that currently exists is direct mapping) to a B-tree. + * + * Return: 0 on success or a negative error code on failure. */ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr, diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index aee40db7a036..a002a44ff161 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -174,7 +174,6 @@ Eend: dir->i_ino, (folio->index << PAGE_SHIFT) + offs, (unsigned long)le64_to_cpu(p->inode)); fail: - folio_set_error(folio); return false; } diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index bf9a11d58817..1c9ae36a03ab 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -175,6 +175,7 @@ int nilfs_init_gcinode(struct inode *inode) /** * nilfs_remove_all_gcinodes() - remove all unprocessed gc inodes + * @nilfs: NILFS filesystem instance */ void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs) { diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 2e29b98ba8ba..728e90be3570 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -335,8 +335,8 @@ void __nilfs_error(struct super_block *sb, const char *function, extern struct nilfs_super_block * nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); -extern int nilfs_store_magic_and_option(struct super_block *, - struct nilfs_super_block *, char *); +extern int nilfs_store_magic(struct super_block *sb, + struct nilfs_super_block *sbp); extern int nilfs_check_feature_compatibility(struct super_block *, struct nilfs_super_block *); extern void nilfs_set_log_cursor(struct nilfs_super_block *, diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 49a70c68bf3c..b638dc06df2f 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -563,6 +563,7 @@ static int nilfs_recover_dsync_blocks(struct 
the_nilfs *nilfs, * checkpoint * @nilfs: nilfs object * @sb: super block instance + * @root: NILFS root instance * @ri: pointer to a nilfs_recovery_info */ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, @@ -698,9 +699,15 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, return; bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize); - BUG_ON(!bh); + if (WARN_ON(!bh)) + return; /* should never happen */ + + lock_buffer(bh); memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); set_buffer_dirty(bh); + unlock_buffer(bh); + err = sync_dirty_buffer(bh); if (unlikely(err)) nilfs_warn(nilfs->ns_sb, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index aa5290cb7467..6be7dd423fbd 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1725,14 +1725,8 @@ static void nilfs_end_folio_io(struct folio *folio, int err) return; } - if (!err) { - if (!nilfs_folio_buffers_clean(folio)) - filemap_dirty_folio(folio->mapping, folio); - folio_clear_error(folio); - } else { + if (err || !nilfs_folio_buffers_clean(folio)) filemap_dirty_folio(folio->mapping, folio); - folio_set_error(folio); - } folio_end_writeback(folio); } @@ -2790,7 +2784,7 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root) if (!nilfs->ns_writer) return -ENOMEM; - inode_attach_wb(nilfs->ns_bdev->bd_inode, NULL); + inode_attach_wb(nilfs->ns_bdev->bd_mapping->host, NULL); err = nilfs_segctor_start_thread(nilfs->ns_writer); if (unlikely(err)) diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index ac24ed109ce9..e835e1f5a712 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -29,13 +29,13 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> -#include <linux/parser.h> #include <linux/crc32.h> #include <linux/vfs.h> #include <linux/writeback.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/fs_context.h> +#include <linux/fs_parser.h> #include "nilfs.h" #include "export.h" #include 
"mdt.h" @@ -61,7 +61,6 @@ struct kmem_cache *nilfs_segbuf_cachep; struct kmem_cache *nilfs_btree_path_cache; static int nilfs_setup_super(struct super_block *sb, int is_mount); -static int nilfs_remount(struct super_block *sb, int *flags, char *data); void __nilfs_msg(struct super_block *sb, const char *fmt, ...) { @@ -702,105 +701,98 @@ static const struct super_operations nilfs_sops = { .freeze_fs = nilfs_freeze, .unfreeze_fs = nilfs_unfreeze, .statfs = nilfs_statfs, - .remount_fs = nilfs_remount, .show_options = nilfs_show_options }; enum { - Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, - Opt_discard, Opt_nodiscard, Opt_err, + Opt_err, Opt_barrier, Opt_snapshot, Opt_order, Opt_norecovery, + Opt_discard, }; -static match_table_t tokens = { - {Opt_err_cont, "errors=continue"}, - {Opt_err_panic, "errors=panic"}, - {Opt_err_ro, "errors=remount-ro"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_snapshot, "cp=%u"}, - {Opt_order, "order=%s"}, - {Opt_norecovery, "norecovery"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_err, NULL} +static const struct constant_table nilfs_param_err[] = { + {"continue", NILFS_MOUNT_ERRORS_CONT}, + {"panic", NILFS_MOUNT_ERRORS_PANIC}, + {"remount-ro", NILFS_MOUNT_ERRORS_RO}, + {} }; -static int parse_options(char *options, struct super_block *sb, int is_remount) -{ - struct the_nilfs *nilfs = sb->s_fs_info; - char *p; - substring_t args[MAX_OPT_ARGS]; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token; +static const struct fs_parameter_spec nilfs_param_spec[] = { + fsparam_enum ("errors", Opt_err, nilfs_param_err), + fsparam_flag_no ("barrier", Opt_barrier), + fsparam_u64 ("cp", Opt_snapshot), + fsparam_string ("order", Opt_order), + fsparam_flag ("norecovery", Opt_norecovery), + fsparam_flag_no ("discard", Opt_discard), + {} +}; - if (!*p) - continue; +struct nilfs_fs_context { + 
unsigned long ns_mount_opt; + __u64 cno; +}; - token = match_token(p, tokens, args); - switch (token) { - case Opt_barrier: - nilfs_set_opt(nilfs, BARRIER); - break; - case Opt_nobarrier: +static int nilfs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct nilfs_fs_context *nilfs = fc->fs_private; + int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, nilfs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_barrier: + if (result.negated) nilfs_clear_opt(nilfs, BARRIER); - break; - case Opt_order: - if (strcmp(args[0].from, "relaxed") == 0) - /* Ordered data semantics */ - nilfs_clear_opt(nilfs, STRICT_ORDER); - else if (strcmp(args[0].from, "strict") == 0) - /* Strict in-order semantics */ - nilfs_set_opt(nilfs, STRICT_ORDER); - else - return 0; - break; - case Opt_err_panic: - nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_PANIC); - break; - case Opt_err_ro: - nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_RO); - break; - case Opt_err_cont: - nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_CONT); - break; - case Opt_snapshot: - if (is_remount) { - nilfs_err(sb, - "\"%s\" option is invalid for remount", - p); - return 0; - } - break; - case Opt_norecovery: - nilfs_set_opt(nilfs, NORECOVERY); - break; - case Opt_discard: - nilfs_set_opt(nilfs, DISCARD); - break; - case Opt_nodiscard: - nilfs_clear_opt(nilfs, DISCARD); - break; - default: - nilfs_err(sb, "unrecognized mount option \"%s\"", p); - return 0; + else + nilfs_set_opt(nilfs, BARRIER); + break; + case Opt_order: + if (strcmp(param->string, "relaxed") == 0) + /* Ordered data semantics */ + nilfs_clear_opt(nilfs, STRICT_ORDER); + else if (strcmp(param->string, "strict") == 0) + /* Strict in-order semantics */ + nilfs_set_opt(nilfs, STRICT_ORDER); + else + return -EINVAL; + break; + case Opt_err: + nilfs->ns_mount_opt &= ~NILFS_MOUNT_ERROR_MODE; + nilfs->ns_mount_opt |= result.uint_32; + break; 
+ case Opt_snapshot: + if (is_remount) { + struct super_block *sb = fc->root->d_sb; + + nilfs_err(sb, + "\"%s\" option is invalid for remount", + param->key); + return -EINVAL; + } + if (result.uint_64 == 0) { + nilfs_err(NULL, + "invalid option \"cp=0\": invalid checkpoint number 0"); + return -EINVAL; } + nilfs->cno = result.uint_64; + break; + case Opt_norecovery: + nilfs_set_opt(nilfs, NORECOVERY); + break; + case Opt_discard: + if (result.negated) + nilfs_clear_opt(nilfs, DISCARD); + else + nilfs_set_opt(nilfs, DISCARD); + break; + default: + return -EINVAL; } - return 1; -} - -static inline void -nilfs_set_default_options(struct super_block *sb, - struct nilfs_super_block *sbp) -{ - struct the_nilfs *nilfs = sb->s_fs_info; - nilfs->ns_mount_opt = - NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; + return 0; } static int nilfs_setup_super(struct super_block *sb, int is_mount) @@ -857,9 +849,8 @@ struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset); } -int nilfs_store_magic_and_option(struct super_block *sb, - struct nilfs_super_block *sbp, - char *data) +int nilfs_store_magic(struct super_block *sb, + struct nilfs_super_block *sbp) { struct the_nilfs *nilfs = sb->s_fs_info; @@ -870,14 +861,12 @@ int nilfs_store_magic_and_option(struct super_block *sb, sb->s_flags |= SB_NOATIME; #endif - nilfs_set_default_options(sb, sbp); - nilfs->ns_resuid = le16_to_cpu(sbp->s_def_resuid); nilfs->ns_resgid = le16_to_cpu(sbp->s_def_resgid); nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval); nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max); - return !parse_options(data, sb, 0) ? 
-EINVAL : 0; + return 0; } int nilfs_check_feature_compatibility(struct super_block *sb, @@ -1035,17 +1024,17 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) /** * nilfs_fill_super() - initialize a super block instance * @sb: super_block - * @data: mount options - * @silent: silent mode flag + * @fc: filesystem context * * This function is called exclusively by nilfs->ns_mount_mutex. * So, the recovery process is protected from other simultaneous mounts. */ static int -nilfs_fill_super(struct super_block *sb, void *data, int silent) +nilfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct the_nilfs *nilfs; struct nilfs_root *fsroot; + struct nilfs_fs_context *ctx = fc->fs_private; __u64 cno; int err; @@ -1055,10 +1044,13 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = nilfs; - err = init_nilfs(nilfs, sb, (char *)data); + err = init_nilfs(nilfs, sb); if (err) goto failed_nilfs; + /* Copy in parsed mount options */ + nilfs->ns_mount_opt = ctx->ns_mount_opt; + sb->s_op = &nilfs_sops; sb->s_export_op = &nilfs_export_ops; sb->s_root = NULL; @@ -1117,34 +1109,25 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) return err; } -static int nilfs_remount(struct super_block *sb, int *flags, char *data) +static int nilfs_reconfigure(struct fs_context *fc) { + struct nilfs_fs_context *ctx = fc->fs_private; + struct super_block *sb = fc->root->d_sb; struct the_nilfs *nilfs = sb->s_fs_info; - unsigned long old_sb_flags; - unsigned long old_mount_opt; int err; sync_filesystem(sb); - old_sb_flags = sb->s_flags; - old_mount_opt = nilfs->ns_mount_opt; - - if (!parse_options(data, sb, 1)) { - err = -EINVAL; - goto restore_opts; - } - sb->s_flags = (sb->s_flags & ~SB_POSIXACL); err = -EINVAL; if (!nilfs_valid_fs(nilfs)) { nilfs_warn(sb, "couldn't remount because the filesystem is in an incomplete recovery state"); - goto restore_opts; + goto ignore_opts; } - - if ((bool)(*flags & SB_RDONLY) == 
sb_rdonly(sb)) + if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) goto out; - if (*flags & SB_RDONLY) { + if (fc->sb_flags & SB_RDONLY) { sb->s_flags |= SB_RDONLY; /* @@ -1172,138 +1155,67 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) "couldn't remount RDWR because of unsupported optional features (%llx)", (unsigned long long)features); err = -EROFS; - goto restore_opts; + goto ignore_opts; } sb->s_flags &= ~SB_RDONLY; root = NILFS_I(d_inode(sb->s_root))->i_root; err = nilfs_attach_log_writer(sb, root); - if (err) - goto restore_opts; + if (err) { + sb->s_flags |= SB_RDONLY; + goto ignore_opts; + } down_write(&nilfs->ns_sem); nilfs_setup_super(sb, true); up_write(&nilfs->ns_sem); } out: - return 0; - - restore_opts: - sb->s_flags = old_sb_flags; - nilfs->ns_mount_opt = old_mount_opt; - return err; -} - -struct nilfs_super_data { - __u64 cno; - int flags; -}; - -static int nilfs_parse_snapshot_option(const char *option, - const substring_t *arg, - struct nilfs_super_data *sd) -{ - unsigned long long val; - const char *msg = NULL; - int err; - - if (!(sd->flags & SB_RDONLY)) { - msg = "read-only option is not specified"; - goto parse_error; - } - - err = kstrtoull(arg->from, 0, &val); - if (err) { - if (err == -ERANGE) - msg = "too large checkpoint number"; - else - msg = "malformed argument"; - goto parse_error; - } else if (val == 0) { - msg = "invalid checkpoint number 0"; - goto parse_error; - } - sd->cno = val; - return 0; - -parse_error: - nilfs_err(NULL, "invalid option \"%s\": %s", option, msg); - return 1; -} - -/** - * nilfs_identify - pre-read mount options needed to identify mount instance - * @data: mount options - * @sd: nilfs_super_data - */ -static int nilfs_identify(char *data, struct nilfs_super_data *sd) -{ - char *p, *options = data; - substring_t args[MAX_OPT_ARGS]; - int token; - int ret = 0; - - do { - p = strsep(&options, ","); - if (p != NULL && *p) { - token = match_token(p, tokens, args); - if (token == 
Opt_snapshot) - ret = nilfs_parse_snapshot_option(p, &args[0], - sd); - } - if (!options) - break; - BUG_ON(options == data); - *(options - 1) = ','; - } while (!ret); - return ret; -} + sb->s_flags = (sb->s_flags & ~SB_POSIXACL); + /* Copy over parsed remount options */ + nilfs->ns_mount_opt = ctx->ns_mount_opt; -static int nilfs_set_bdev_super(struct super_block *s, void *data) -{ - s->s_dev = *(dev_t *)data; return 0; -} -static int nilfs_test_bdev_super(struct super_block *s, void *data) -{ - return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; + ignore_opts: + return err; } -static struct dentry * -nilfs_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +static int +nilfs_get_tree(struct fs_context *fc) { - struct nilfs_super_data sd = { .flags = flags }; + struct nilfs_fs_context *ctx = fc->fs_private; struct super_block *s; dev_t dev; int err; - if (nilfs_identify(data, &sd)) - return ERR_PTR(-EINVAL); + if (ctx->cno && !(fc->sb_flags & SB_RDONLY)) { + nilfs_err(NULL, + "invalid option \"cp=%llu\": read-only option is not specified", + ctx->cno); + return -EINVAL; + } - err = lookup_bdev(dev_name, &dev); + err = lookup_bdev(fc->source, &dev); if (err) - return ERR_PTR(err); + return err; - s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, flags, - &dev); + s = sget_dev(fc, dev); if (IS_ERR(s)) - return ERR_CAST(s); + return PTR_ERR(s); if (!s->s_root) { - err = setup_bdev_super(s, flags, NULL); + err = setup_bdev_super(s, fc->sb_flags, fc); if (!err) - err = nilfs_fill_super(s, data, - flags & SB_SILENT ? 1 : 0); + err = nilfs_fill_super(s, fc); if (err) goto failed_super; s->s_flags |= SB_ACTIVE; - } else if (!sd.cno) { + } else if (!ctx->cno) { if (nilfs_tree_is_busy(s->s_root)) { - if ((flags ^ s->s_flags) & SB_RDONLY) { + if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { nilfs_err(s, "the device already has a %s mount.", sb_rdonly(s) ? 
"read-only" : "read/write"); @@ -1312,37 +1224,75 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } } else { /* - * Try remount to setup mount states if the current + * Try reconfigure to setup mount states if the current * tree is not mounted and only snapshots use this sb. + * + * Since nilfs_reconfigure() requires fc->root to be + * set, set it first and release it on failure. */ - err = nilfs_remount(s, &flags, data); - if (err) + fc->root = dget(s->s_root); + err = nilfs_reconfigure(fc); + if (err) { + dput(fc->root); + fc->root = NULL; /* prevent double release */ goto failed_super; + } + return 0; } } - if (sd.cno) { + if (ctx->cno) { struct dentry *root_dentry; - err = nilfs_attach_snapshot(s, sd.cno, &root_dentry); + err = nilfs_attach_snapshot(s, ctx->cno, &root_dentry); if (err) goto failed_super; - return root_dentry; + fc->root = root_dentry; + return 0; } - return dget(s->s_root); + fc->root = dget(s->s_root); + return 0; failed_super: deactivate_locked_super(s); - return ERR_PTR(err); + return err; +} + +static void nilfs_free_fc(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations nilfs_context_ops = { + .parse_param = nilfs_parse_param, + .get_tree = nilfs_get_tree, + .reconfigure = nilfs_reconfigure, + .free = nilfs_free_fc, +}; + +static int nilfs_init_fs_context(struct fs_context *fc) +{ + struct nilfs_fs_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->ns_mount_opt = NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; + fc->fs_private = ctx; + fc->ops = &nilfs_context_ops; + + return 0; } struct file_system_type nilfs_fs_type = { .owner = THIS_MODULE, .name = "nilfs2", - .mount = nilfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = nilfs_init_fs_context, + .parameters = nilfs_param_spec, }; MODULE_ALIAS_FS("nilfs2"); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 2ae2c1bbf6d1..f41d7b6d432c 
100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -592,7 +592,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, struct nilfs_super_block **sbp = nilfs->ns_sbp; struct buffer_head **sbh = nilfs->ns_sbh; u64 sb2off, devsize = bdev_nr_bytes(nilfs->ns_bdev); - int valid[2], swp = 0; + int valid[2], swp = 0, older; if (devsize < NILFS_SEG_MIN_BLOCKS * NILFS_MIN_BLOCK_SIZE + 4096) { nilfs_err(sb, "device size too small"); @@ -648,9 +648,25 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, if (swp) nilfs_swap_super_block(nilfs); + /* + * Calculate the array index of the older superblock data. + * If one has been dropped, set index 0 pointing to the remaining one, + * otherwise set index 1 pointing to the old one (including if both + * are the same). + * + * Divided case valid[0] valid[1] swp -> older + * ------------------------------------------------------------- + * Both SBs are invalid 0 0 N/A (Error) + * SB1 is invalid 0 1 1 0 + * SB2 is invalid 1 0 0 0 + * SB2 is newer 1 1 1 0 + * SB2 is older or the same 1 1 0 1 + */ + older = valid[1] ^ swp; + nilfs->ns_sbwcount = 0; nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); - nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq); + nilfs->ns_prot_seq = le64_to_cpu(sbp[older]->s_last_seq); *sbpp = sbp[0]; return 0; } @@ -659,7 +675,6 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, * init_nilfs - initialize a NILFS instance. * @nilfs: the_nilfs structure * @sb: super block - * @data: mount options * * init_nilfs() performs common initialization per block device (e.g. * reading the super block, getting disk layout information, initializing @@ -668,7 +683,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, * Return Value: On success, 0 is returned. On error, a negative error * code is returned. 
*/ -int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) +int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb) { struct nilfs_super_block *sbp; int blocksize; @@ -686,7 +701,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) if (err) goto out; - err = nilfs_store_magic_and_option(sb, sbp, data); + err = nilfs_store_magic(sb, sbp); if (err) goto failed_sbh; diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index cd4ae1b8ae16..85da0629415d 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -219,10 +219,6 @@ THE_NILFS_FNS(PURGING, purging) #define nilfs_set_opt(nilfs, opt) \ ((nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt) #define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt) -#define nilfs_write_opt(nilfs, mask, opt) \ - ((nilfs)->ns_mount_opt = \ - (((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) | \ - NILFS_MOUNT_##opt)) \ /** * struct nilfs_root - nilfs root object @@ -276,7 +272,7 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs) void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); struct the_nilfs *alloc_nilfs(struct super_block *sb); void destroy_nilfs(struct the_nilfs *nilfs); -int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data); +int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb); int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb); unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs); void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs); diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 3464fa7e8538..f3669403fabf 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -162,7 +162,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) if (!S_ISDIR(inode->i_mode)) return; - fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); + fsn_mark = 
fsnotify_find_inode_mark(inode, dnotify_group); if (!fsn_mark) return; dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); @@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) fsnotify_group_lock(dnotify_group); /* add the new_fsn_mark or find an old one. */ - fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); + fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group); if (fsn_mark) { dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index fbdc63cc10d9..9ec313e9f6e1 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1076,7 +1076,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, } static int fanotify_remove_mark(struct fsnotify_group *group, - fsnotify_connp_t *connp, __u32 mask, + void *obj, unsigned int obj_type, __u32 mask, unsigned int flags, __u32 umask) { struct fsnotify_mark *fsn_mark = NULL; @@ -1084,7 +1084,7 @@ static int fanotify_remove_mark(struct fsnotify_group *group, int destroy_mark; fsnotify_group_lock(group); - fsn_mark = fsnotify_find_mark(connp, group); + fsn_mark = fsnotify_find_mark(obj, obj_type, group); if (!fsn_mark) { fsnotify_group_unlock(group); return -ENOENT; @@ -1105,30 +1105,6 @@ static int fanotify_remove_mark(struct fsnotify_group *group, return 0; } -static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, - struct vfsmount *mnt, __u32 mask, - unsigned int flags, __u32 umask) -{ - return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, - mask, flags, umask); -} - -static int fanotify_remove_sb_mark(struct fsnotify_group *group, - struct super_block *sb, __u32 mask, - unsigned int flags, __u32 umask) -{ - return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask, - flags, umask); -} - -static int fanotify_remove_inode_mark(struct 
fsnotify_group *group, - struct inode *inode, __u32 mask, - unsigned int flags, __u32 umask) -{ - return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask, - flags, umask); -} - static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark, unsigned int fan_flags) { @@ -1249,7 +1225,7 @@ static int fanotify_set_mark_fsid(struct fsnotify_group *group, } static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, - fsnotify_connp_t *connp, + void *obj, unsigned int obj_type, unsigned int fan_flags, struct fan_fsid *fsid) @@ -1288,7 +1264,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0; } - ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0); + ret = fsnotify_add_mark_locked(mark, obj, obj_type, 0); if (ret) goto out_put_mark; @@ -1344,7 +1320,7 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, } static int fanotify_add_mark(struct fsnotify_group *group, - fsnotify_connp_t *connp, unsigned int obj_type, + void *obj, unsigned int obj_type, __u32 mask, unsigned int fan_flags, struct fan_fsid *fsid) { @@ -1353,9 +1329,9 @@ static int fanotify_add_mark(struct fsnotify_group *group, int ret = 0; fsnotify_group_lock(group); - fsn_mark = fsnotify_find_mark(connp, group); + fsn_mark = fsnotify_find_mark(obj, obj_type, group); if (!fsn_mark) { - fsn_mark = fanotify_add_new_mark(group, connp, obj_type, + fsn_mark = fanotify_add_new_mark(group, obj, obj_type, fan_flags, fsid); if (IS_ERR(fsn_mark)) { fsnotify_group_unlock(group); @@ -1392,42 +1368,6 @@ out: return ret; } -static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, - struct vfsmount *mnt, __u32 mask, - unsigned int flags, struct fan_fsid *fsid) -{ - return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid); -} - -static int fanotify_add_sb_mark(struct fsnotify_group *group, - struct 
super_block *sb, __u32 mask, - unsigned int flags, struct fan_fsid *fsid) -{ - return fanotify_add_mark(group, &sb->s_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid); -} - -static int fanotify_add_inode_mark(struct fsnotify_group *group, - struct inode *inode, __u32 mask, - unsigned int flags, struct fan_fsid *fsid) -{ - pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); - - /* - * If some other task has this inode open for write we should not add - * an ignore mask, unless that ignore mask is supposed to survive - * modification changes anyway. - */ - if ((flags & FANOTIFY_MARK_IGNORE_BITS) && - !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && - inode_is_open_for_write(inode)) - return 0; - - return fanotify_add_mark(group, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid); -} - static struct fsnotify_event *fanotify_alloc_overflow_event(void) { struct fanotify_event *oevent; @@ -1576,13 +1516,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) INIT_LIST_HEAD(&group->fanotify_data.access_list); switch (class) { case FAN_CLASS_NOTIF: - group->priority = FS_PRIO_0; + group->priority = FSNOTIFY_PRIO_NORMAL; break; case FAN_CLASS_CONTENT: - group->priority = FS_PRIO_1; + group->priority = FSNOTIFY_PRIO_CONTENT; break; case FAN_CLASS_PRE_CONTENT: - group->priority = FS_PRIO_2; + group->priority = FSNOTIFY_PRIO_PRE_CONTENT; break; default: fd = -EINVAL; @@ -1750,6 +1690,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS; unsigned int obj_type, fid_mode; + void *obj; u32 umask = 0; int ret; @@ -1833,12 +1774,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, goto fput_and_out; /* - * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not - * allowed to set permissions events. 
+ * Permission events require minimum priority FAN_CLASS_CONTENT. */ ret = -EINVAL; if (mask & FANOTIFY_PERM_EVENTS && - group->priority == FS_PRIO_0) + group->priority < FSNOTIFY_PRIO_CONTENT) goto fput_and_out; if (mask & FAN_FS_ERROR && @@ -1908,17 +1848,34 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } /* inode held in place by reference to path; group by fget on fd */ - if (mark_type == FAN_MARK_INODE) + if (mark_type == FAN_MARK_INODE) { inode = path.dentry->d_inode; - else + obj = inode; + } else { mnt = path.mnt; + if (mark_type == FAN_MARK_MOUNT) + obj = mnt; + else + obj = mnt->mnt_sb; + } - ret = mnt ? -EINVAL : -EISDIR; - /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ - if (mark_cmd == FAN_MARK_ADD && ignore == FAN_MARK_IGNORE && - (mnt || S_ISDIR(inode->i_mode)) && - !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) - goto path_put_and_out; + /* + * If some other task has this inode open for write we should not add + * an ignore mask, unless that ignore mask is supposed to survive + * modification changes anyway. + */ + if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) && + !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) { + ret = mnt ? 
-EINVAL : -EISDIR; + /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ + if (ignore == FAN_MARK_IGNORE && + (mnt || S_ISDIR(inode->i_mode))) + goto path_put_and_out; + + ret = 0; + if (inode && inode_is_open_for_write(inode)) + goto path_put_and_out; + } /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ if (mnt || !S_ISDIR(inode->i_mode)) { @@ -1936,26 +1893,12 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, /* create/update an inode mark */ switch (mark_cmd) { case FAN_MARK_ADD: - if (mark_type == FAN_MARK_MOUNT) - ret = fanotify_add_vfsmount_mark(group, mnt, mask, - flags, fsid); - else if (mark_type == FAN_MARK_FILESYSTEM) - ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, - flags, fsid); - else - ret = fanotify_add_inode_mark(group, inode, mask, - flags, fsid); + ret = fanotify_add_mark(group, obj, obj_type, mask, flags, + fsid); break; case FAN_MARK_REMOVE: - if (mark_type == FAN_MARK_MOUNT) - ret = fanotify_remove_vfsmount_mark(group, mnt, mask, - flags, umask); - else if (mark_type == FAN_MARK_FILESYSTEM) - ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, - flags, umask); - else - ret = fanotify_remove_inode_mark(group, inode, mask, - flags, umask); + ret = fanotify_remove_mark(group, obj, obj_type, mask, flags, + umask); break; default: ret = -EINVAL; diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 5c430736ec12..dec553034027 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -41,29 +41,25 @@ static void show_fdinfo(struct seq_file *m, struct file *f, #if defined(CONFIG_EXPORTFS) static void show_mark_fhandle(struct seq_file *m, struct inode *inode) { - struct { - struct file_handle handle; - u8 pad[MAX_HANDLE_SZ]; - } f; + DEFINE_FLEX(struct file_handle, f, f_handle, handle_bytes, MAX_HANDLE_SZ); int size, ret, i; - f.handle.handle_bytes = sizeof(f.pad); - size = f.handle.handle_bytes >> 2; + size = f->handle_bytes >> 2; - ret = exportfs_encode_fid(inode, (struct 
fid *)f.handle.f_handle, &size); + ret = exportfs_encode_fid(inode, (struct fid *)f->f_handle, &size); if ((ret == FILEID_INVALID) || (ret < 0)) { WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); return; } - f.handle.handle_type = ret; - f.handle.handle_bytes = size * sizeof(u32); + f->handle_type = ret; + f->handle_bytes = size * sizeof(u32); seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:", - f.handle.handle_bytes, f.handle.handle_type); + f->handle_bytes, f->handle_type); - for (i = 0; i < f.handle.handle_bytes; i++) - seq_printf(m, "%02x", (int)f.handle.f_handle[i]); + for (i = 0; i < f->handle_bytes; i++) + seq_printf(m, "%02x", (int)f->f_handle[i]); } #else static void show_mark_fhandle(struct seq_file *m, struct inode *inode) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 2fc105a72a8f..ff69ae24c4e8 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -89,11 +89,25 @@ static void fsnotify_unmount_inodes(struct super_block *sb) void fsnotify_sb_delete(struct super_block *sb) { + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); + + /* Were any marks ever added to any object on this sb? 
*/ + if (!sbinfo) + return; + fsnotify_unmount_inodes(sb); fsnotify_clear_marks_by_sb(sb); /* Wait for outstanding object references from connectors */ - wait_var_event(&sb->s_fsnotify_connectors, - !atomic_long_read(&sb->s_fsnotify_connectors)); + wait_var_event(fsnotify_sb_watched_objects(sb), + !atomic_long_read(fsnotify_sb_watched_objects(sb))); + WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT)); + WARN_ON(fsnotify_sb_has_priority_watchers(sb, + FSNOTIFY_PRIO_PRE_CONTENT)); +} + +void fsnotify_sb_free(struct super_block *sb) +{ + kfree(sb->s_fsnotify_info); } /* @@ -489,6 +503,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, { const struct path *path = fsnotify_data_path(data, data_type); struct super_block *sb = fsnotify_data_sb(data, data_type); + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; struct inode *inode2 = NULL; @@ -525,7 +540,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". 
*/ - if (!sb->s_fsnotify_marks && + if ((!sbinfo || !sbinfo->sb_marks) && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && (!inode2 || !inode2->i_fsnotify_marks)) @@ -552,8 +567,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); - iter_info.marks[FSNOTIFY_ITER_TYPE_SB] = - fsnotify_first_mark(&sb->s_fsnotify_marks); + if (sbinfo) { + iter_info.marks[FSNOTIFY_ITER_TYPE_SB] = + fsnotify_first_mark(&sbinfo->sb_marks); + } if (mnt) { iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index fde74eb333cc..2d059f789ee3 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -9,39 +9,58 @@ #include "../mount.h" +/* + * fsnotify_connp_t is what we embed in objects which connector can be attached + * to. + */ +typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t; + static inline struct inode *fsnotify_conn_inode( struct fsnotify_mark_connector *conn) { - return container_of(conn->obj, struct inode, i_fsnotify_marks); + return conn->obj; } static inline struct mount *fsnotify_conn_mount( struct fsnotify_mark_connector *conn) { - return container_of(conn->obj, struct mount, mnt_fsnotify_marks); + return real_mount(conn->obj); } static inline struct super_block *fsnotify_conn_sb( struct fsnotify_mark_connector *conn) { - return container_of(conn->obj, struct super_block, s_fsnotify_marks); + return conn->obj; } -static inline struct super_block *fsnotify_connector_sb( - struct fsnotify_mark_connector *conn) +static inline struct super_block *fsnotify_object_sb(void *obj, + enum fsnotify_obj_type obj_type) { - switch (conn->type) { + switch (obj_type) { case FSNOTIFY_OBJ_TYPE_INODE: - return fsnotify_conn_inode(conn)->i_sb; + return ((struct inode *)obj)->i_sb; case FSNOTIFY_OBJ_TYPE_VFSMOUNT: - return fsnotify_conn_mount(conn)->mnt.mnt_sb; + 
return ((struct vfsmount *)obj)->mnt_sb; case FSNOTIFY_OBJ_TYPE_SB: - return fsnotify_conn_sb(conn); + return (struct super_block *)obj; default: return NULL; } } +static inline struct super_block *fsnotify_connector_sb( + struct fsnotify_mark_connector *conn) +{ + return fsnotify_object_sb(conn->obj, conn->type); +} + +static inline fsnotify_connp_t *fsnotify_sb_marks(struct super_block *sb) +{ + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); + + return sbinfo ? &sbinfo->sb_marks : NULL; +} + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); @@ -67,7 +86,7 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) /* run the list of all marks associated with sb and destroy them */ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) { - fsnotify_destroy_marks(&sb->s_fsnotify_marks); + fsnotify_destroy_marks(fsnotify_sb_marks(sb)); } /* diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 85d8fdd55329..4ffc30606e0b 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -544,7 +544,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, int create = (arg & IN_MASK_CREATE); int ret; - fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); + fsn_mark = fsnotify_find_inode_mark(inode, group); if (!fsn_mark) return -ENOENT; else if (create) { diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d6944ff86ffa..c3eefa70633c 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -97,6 +97,21 @@ void fsnotify_get_mark(struct fsnotify_mark *mark) refcount_inc(&mark->refcnt); } +static fsnotify_connp_t *fsnotify_object_connp(void *obj, + enum fsnotify_obj_type obj_type) +{ + switch (obj_type) { + case FSNOTIFY_OBJ_TYPE_INODE: + return &((struct inode *)obj)->i_fsnotify_marks; + case FSNOTIFY_OBJ_TYPE_VFSMOUNT: + return &real_mount(obj)->mnt_fsnotify_marks; + 
case FSNOTIFY_OBJ_TYPE_SB: + return fsnotify_sb_marks(obj); + default: + return NULL; + } +} + static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn) { if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) @@ -116,10 +131,69 @@ __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn) return *fsnotify_conn_mask_p(conn); } +static void fsnotify_get_sb_watched_objects(struct super_block *sb) +{ + atomic_long_inc(fsnotify_sb_watched_objects(sb)); +} + +static void fsnotify_put_sb_watched_objects(struct super_block *sb) +{ + if (atomic_long_dec_and_test(fsnotify_sb_watched_objects(sb))) + wake_up_var(fsnotify_sb_watched_objects(sb)); +} + static void fsnotify_get_inode_ref(struct inode *inode) { ihold(inode); - atomic_long_inc(&inode->i_sb->s_fsnotify_connectors); + fsnotify_get_sb_watched_objects(inode->i_sb); +} + +static void fsnotify_put_inode_ref(struct inode *inode) +{ + fsnotify_put_sb_watched_objects(inode->i_sb); + iput(inode); +} + +/* + * Grab or drop watched objects reference depending on whether the connector + * is attached and has any marks attached. + */ +static void fsnotify_update_sb_watchers(struct super_block *sb, + struct fsnotify_mark_connector *conn) +{ + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); + bool is_watched = conn->flags & FSNOTIFY_CONN_FLAG_IS_WATCHED; + struct fsnotify_mark *first_mark = NULL; + unsigned int highest_prio = 0; + + if (conn->obj) + first_mark = hlist_entry_safe(conn->list.first, + struct fsnotify_mark, obj_list); + if (first_mark) + highest_prio = first_mark->group->priority; + if (WARN_ON(highest_prio >= __FSNOTIFY_PRIO_NUM)) + highest_prio = 0; + + /* + * If the highest priority of group watching this object is prio, + * then watched object has a reference on counters [0..prio]. + * Update priority >= 1 watched objects counters. 
+ */ + for (unsigned int p = conn->prio + 1; p <= highest_prio; p++) + atomic_long_inc(&sbinfo->watched_objects[p]); + for (unsigned int p = conn->prio; p > highest_prio; p--) + atomic_long_dec(&sbinfo->watched_objects[p]); + conn->prio = highest_prio; + + /* Update priority >= 0 (a.k.a total) watched objects counter */ + BUILD_BUG_ON(FSNOTIFY_PRIO_NORMAL != 0); + if (first_mark && !is_watched) { + conn->flags |= FSNOTIFY_CONN_FLAG_IS_WATCHED; + fsnotify_get_sb_watched_objects(sb); + } else if (!first_mark && is_watched) { + conn->flags &= ~FSNOTIFY_CONN_FLAG_IS_WATCHED; + fsnotify_put_sb_watched_objects(sb); + } } /* @@ -213,35 +287,12 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work) } } -static void fsnotify_put_inode_ref(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - - iput(inode); - if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) - wake_up_var(&sb->s_fsnotify_connectors); -} - -static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn) -{ - struct super_block *sb = fsnotify_connector_sb(conn); - - if (sb) - atomic_long_inc(&sb->s_fsnotify_connectors); -} - -static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn) -{ - struct super_block *sb = fsnotify_connector_sb(conn); - - if (sb && atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) - wake_up_var(&sb->s_fsnotify_connectors); -} - static void *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn, unsigned int *type) { + fsnotify_connp_t *connp = fsnotify_object_connp(conn->obj, conn->type); + struct super_block *sb = fsnotify_connector_sb(conn); struct inode *inode = NULL; *type = conn->type; @@ -261,10 +312,10 @@ static void *fsnotify_detach_connector_from_object( fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } - fsnotify_put_sb_connectors(conn); - rcu_assign_pointer(*(conn->obj), NULL); + rcu_assign_pointer(*connp, NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; + 
fsnotify_update_sb_watchers(sb, conn); return inode; } @@ -316,6 +367,11 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) objp = fsnotify_detach_connector_from_object(conn, &type); free_conn = true; } else { + struct super_block *sb = fsnotify_connector_sb(conn); + + /* Update watched objects after detaching mark */ + if (sb) + fsnotify_update_sb_watchers(sb, conn); objp = __fsnotify_recalc_mask(conn); type = conn->type; } @@ -536,8 +592,28 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) return -1; } +static int fsnotify_attach_info_to_sb(struct super_block *sb) +{ + struct fsnotify_sb_info *sbinfo; + + /* sb info is freed on fsnotify_sb_delete() */ + sbinfo = kzalloc(sizeof(*sbinfo), GFP_KERNEL); + if (!sbinfo) + return -ENOMEM; + + /* + * cmpxchg() provides the barrier so that callers of fsnotify_sb_info() + * will observe an initialized structure + */ + if (cmpxchg(&sb->s_fsnotify_info, NULL, sbinfo)) { + /* Someone else created sbinfo for us */ + kfree(sbinfo); + } + return 0; +} + static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, - unsigned int obj_type) + void *obj, unsigned int obj_type) { struct fsnotify_mark_connector *conn; @@ -547,10 +623,9 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, spin_lock_init(&conn->lock); INIT_HLIST_HEAD(&conn->list); conn->flags = 0; + conn->prio = 0; conn->type = obj_type; - conn->obj = connp; - conn->flags = 0; - fsnotify_get_sb_connectors(conn); + conn->obj = obj; /* * cmpxchg() provides the barrier so that readers of *connp can see @@ -558,10 +633,8 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, */ if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ - fsnotify_put_sb_connectors(conn); kmem_cache_free(fsnotify_mark_connector_cachep, conn); } - return 0; } @@ -598,24 +671,36 @@ out: * to which group and for which inodes. 
These marks are ordered according to * priority, highest number first, and then by the group's location in memory. */ -static int fsnotify_add_mark_list(struct fsnotify_mark *mark, - fsnotify_connp_t *connp, +static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { + struct super_block *sb = fsnotify_object_sb(obj, obj_type); struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; + fsnotify_connp_t *connp; int cmp; int err = 0; if (WARN_ON(!fsnotify_valid_obj_type(obj_type))) return -EINVAL; + /* + * Attach the sb info before attaching a connector to any object on sb. + * The sb info will remain attached as long as sb lives. + */ + if (!fsnotify_sb_info(sb)) { + err = fsnotify_attach_info_to_sb(sb); + if (err) + return err; + } + + connp = fsnotify_object_connp(obj, obj_type); restart: spin_lock(&mark->lock); conn = fsnotify_grab_connector(connp); if (!conn) { spin_unlock(&mark->lock); - err = fsnotify_attach_connector_to_object(connp, obj_type); + err = fsnotify_attach_connector_to_object(connp, obj, obj_type); if (err) return err; goto restart; @@ -649,6 +734,7 @@ restart: /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: + fsnotify_update_sb_watchers(sb, conn); /* * Since connector is attached to object using cmpxchg() we are * guaranteed that connector initialization is fully visible by anyone @@ -667,7 +753,7 @@ out_err: * event types should be delivered to which group. 
*/ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, - fsnotify_connp_t *connp, unsigned int obj_type, + void *obj, unsigned int obj_type, int add_flags) { struct fsnotify_group *group = mark->group; @@ -688,7 +774,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); - ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags); + ret = fsnotify_add_mark_list(mark, obj, obj_type, add_flags); if (ret) goto err; @@ -706,14 +792,14 @@ err: return ret; } -int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, +int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { int ret; struct fsnotify_group *group = mark->group; fsnotify_group_lock(group); - ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags); + ret = fsnotify_add_mark_locked(mark, obj, obj_type, add_flags); fsnotify_group_unlock(group); return ret; } @@ -723,12 +809,16 @@ EXPORT_SYMBOL_GPL(fsnotify_add_mark); * Given a list of marks, find the mark associated with given group. If found * take a reference to that mark and return it, else return NULL. 
*/ -struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, +struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type, struct fsnotify_group *group) { + fsnotify_connp_t *connp = fsnotify_object_connp(obj, obj_type); struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark; + if (!connp) + return NULL; + conn = fsnotify_grab_connector(connp); if (!conn) return NULL; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index b82185075de7..f0467d3b3c88 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2283,8 +2283,6 @@ unlock: ocfs2_inode_unlock(inode, 1); brelse(di_bh); out: - if (ret < 0) - ret = -EIO; return ret; } diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 960080753d3b..2b8fa3e782fb 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1784,6 +1784,9 @@ static int o2net_accept_one(struct socket *sock, int *more) struct o2nm_node *node = NULL; struct o2nm_node *local_node = NULL; struct o2net_sock_container *sc = NULL; + struct proto_accept_arg arg = { + .flags = O_NONBLOCK, + }; struct o2net_node *nn; unsigned int nofs_flag; @@ -1802,7 +1805,7 @@ static int o2net_accept_one(struct socket *sock, int *more) new_sock->type = sock->type; new_sock->ops = sock->ops; - ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, false); + ret = sock->ops->accept(sock, new_sock, &arg); if (ret < 0) goto out; diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 5c04dde99981..2018501b2249 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1274,7 +1274,7 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, { struct dlm_query_nodeinfo *qn; struct dlm_ctxt *dlm = NULL; - int locked = 0, status = -EINVAL; + int status = -EINVAL; qn = (struct dlm_query_nodeinfo *) msg->buf; @@ -1290,12 +1290,11 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, } spin_lock(&dlm->spinlock); - locked = 1; if (dlm->joining_node != qn->qn_nodenum) { 
mlog(ML_ERROR, "Node %d queried nodes on domain %s but " "joining node is %d\n", qn->qn_nodenum, qn->qn_domain, dlm->joining_node); - goto bail; + goto unlock; } /* Support for node query was added in 1.1 */ @@ -1305,14 +1304,14 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, "but active dlm protocol is %d.%d\n", qn->qn_nodenum, qn->qn_domain, dlm->dlm_locking_proto.pv_major, dlm->dlm_locking_proto.pv_minor); - goto bail; + goto unlock; } status = dlm_match_nodes(dlm, qn); +unlock: + spin_unlock(&dlm->spinlock); bail: - if (locked) - spin_unlock(&dlm->spinlock); spin_unlock(&dlm_domain_lock); return status; @@ -1528,7 +1527,6 @@ static void dlm_send_join_asserts(struct dlm_ctxt *dlm, { int status, node, live; - status = 0; node = -1; while ((node = find_next_bit(node_map, O2NM_MAX_NODES, node + 1)) < O2NM_MAX_NODES) { diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index b8b6a191b5cb..96b684763b39 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -255,9 +255,9 @@ static struct dentry *ocfs2_fh_to_dentry(struct super_block *sb, if (fh_len < 3 || fh_type > 2) return NULL; - handle.ih_blkno = (u64)le32_to_cpu(fid->raw[0]) << 32; - handle.ih_blkno |= (u64)le32_to_cpu(fid->raw[1]); - handle.ih_generation = le32_to_cpu(fid->raw[2]); + handle.ih_blkno = (u64)le32_to_cpu((__force __le32)fid->raw[0]) << 32; + handle.ih_blkno |= (u64)le32_to_cpu((__force __le32)fid->raw[1]); + handle.ih_generation = le32_to_cpu((__force __le32)fid->raw[2]); return ocfs2_get_dentry(sb, &handle); } @@ -269,9 +269,9 @@ static struct dentry *ocfs2_fh_to_parent(struct super_block *sb, if (fh_type != 2 || fh_len < 6) return NULL; - parent.ih_blkno = (u64)le32_to_cpu(fid->raw[3]) << 32; - parent.ih_blkno |= (u64)le32_to_cpu(fid->raw[4]); - parent.ih_generation = le32_to_cpu(fid->raw[5]); + parent.ih_blkno = (u64)le32_to_cpu((__force __le32)fid->raw[3]) << 32; + parent.ih_blkno |= (u64)le32_to_cpu((__force __le32)fid->raw[4]); + parent.ih_generation = 
le32_to_cpu((__force __le32)fid->raw[5]); return ocfs2_get_dentry(sb, &parent); } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 0da8e7bd3261..ccc57038a977 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1936,6 +1936,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, inode_lock(inode); + /* Wait all existing dio workers, newcomers will block on i_rwsem */ + inode_dio_wait(inode); /* * This prevents concurrent writes on other nodes */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 999111bfc271..2cc5c99fe941 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1621,6 +1621,7 @@ static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info } static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci) +__acquires(&oi->ip_lock) { struct ocfs2_inode_info *oi = cache_info_to_inode(ci); @@ -1628,6 +1629,7 @@ static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci) } static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci) +__releases(&oi->ip_lock) { struct ocfs2_inode_info *oi = cache_info_to_inode(ci); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index b1550ba73f96..71beef7f8a60 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -125,6 +125,7 @@ int ocfs2_fileattr_set(struct mnt_idmap *idmap, ocfs2_inode->ip_attr = flags; ocfs2_set_inode_flags(inode); + inode_set_ctime_current(inode); status = ocfs2_mark_inode_dirty(handle, inode, bh); if (status < 0) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index c803c10dd97e..5df34561c551 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -212,14 +212,15 @@ static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, unsigned int num_clusters) { - spin_lock(&osb->osb_lock); - if (osb->local_alloc_state == OCFS2_LA_DISABLED || - osb->local_alloc_state == OCFS2_LA_THROTTLED) - if (num_clusters >= osb->local_alloc_default_bits) { + 
if (num_clusters >= osb->local_alloc_default_bits) { + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED || + osb->local_alloc_state == OCFS2_LA_THROTTLED) { cancel_delayed_work(&osb->la_enable_wq); osb->local_alloc_state = OCFS2_LA_ENABLED; } - spin_unlock(&osb->osb_lock); + spin_unlock(&osb->osb_lock); + } } void ocfs2_la_enable_worker(struct work_struct *work) @@ -335,7 +336,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) "found = %u, set = %u, taken = %u, off = %u\n", num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), le32_to_cpu(alloc->id1.bitmap1.i_total), - OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); + le32_to_cpu(OCFS2_LOCAL_ALLOC(alloc)->la_bm_off)); status = -EINVAL; goto bail; @@ -863,14 +864,8 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, numfound = bitoff = startoff = 0; left = le32_to_cpu(alloc->id1.bitmap1.i_total); - while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) { - if (bitoff == left) { - /* mlog(0, "bitoff (%d) == left", bitoff); */ - break; - } - /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, " - "numfound = %d\n", bitoff, startoff, numfound);*/ - + while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) < + left) { /* Ok, we found a zero bit... is it contig. 
or do we * start over?*/ if (bitoff == startoff) { @@ -976,9 +971,9 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, start = count = 0; left = le32_to_cpu(alloc->id1.bitmap1.i_total); - while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) - != -1) { - if ((bit_off < left) && (bit_off == start)) { + while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) < + left) { + if (bit_off == start) { count++; start++; continue; @@ -1002,8 +997,7 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, goto bail; } } - if (bit_off >= left) - break; + count = 1; start = bit_off + 1; } @@ -1220,7 +1214,7 @@ retry_enospc: OCFS2_LOCAL_ALLOC(alloc)->la_bitmap); trace_ocfs2_local_alloc_new_window_result( - OCFS2_LOCAL_ALLOC(alloc)->la_bm_off, + le32_to_cpu(OCFS2_LOCAL_ALLOC(alloc)->la_bm_off), le32_to_cpu(alloc->id1.bitmap1.i_total)); bail: diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 1f9ed117e78b..f9d6a4f9ca92 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -685,7 +685,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, } ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, - goal_bit, len); + goal_bit, len, 0, 0); if (ret) { ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len, le16_to_cpu(gd->bg_chain)); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 9221a33f917b..4d1ea8703fcd 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -566,7 +566,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, fe->i_last_eb_blk = 0; strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL); - ktime_get_real_ts64(&ts); + ktime_get_coarse_real_ts64(&ts); fe->i_atime = fe->i_ctime = fe->i_mtime = cpu_to_le64(ts.tv_sec); fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = @@ -797,6 +797,7 @@ static int ocfs2_link(struct dentry *old_dentry, ocfs2_set_links_count(fe, inode->i_nlink); fe->i_ctime = 
cpu_to_le64(inode_get_ctime_sec(inode)); fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); err = ocfs2_add_entry(handle, dentry, inode, @@ -993,6 +994,7 @@ static int ocfs2_unlink(struct inode *dir, drop_nlink(inode); drop_nlink(inode); ocfs2_set_links_count(fe, inode->i_nlink); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 7aebdbf5cc0a..c93689b568fe 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -883,7 +883,8 @@ struct ocfs2_group_desc __le16 bg_free_bits_count; /* Free bits count */ __le16 bg_chain; /* What chain I am in. */ /*10*/ __le32 bg_generation; - __le32 bg_reserved1; + __le16 bg_contig_free_bits; /* max contig free bits length */ + __le16 bg_reserved1; __le64 bg_next_group; /* Next group in my list, in blocks */ /*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 9898c11bdfa1..60e208b01c8d 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -82,7 +82,7 @@ DECLARE_EVENT_CLASS(ocfs2__string, __string(name,name) ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); ), TP_printk("%s", __get_str(name)) ); @@ -1289,7 +1289,7 @@ DECLARE_EVENT_CLASS(ocfs2__file_ops, __entry->dentry = dentry; __entry->ino = ino; __entry->d_len = d_len; - __assign_str(d_name, d_name); + __assign_str(d_name); __entry->para = para; ), TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file, @@ -1425,7 +1425,7 @@ TRACE_EVENT(ocfs2_setattr, __entry->dentry = dentry; __entry->ino = ino; __entry->d_len = d_len; - __assign_str(d_name, d_name); + __assign_str(d_name); __entry->ia_valid = ia_valid; __entry->ia_mode = ia_mode; __entry->ia_uid = ia_uid; @@ -1683,7 +1683,7 @@ TRACE_EVENT(ocfs2_parse_options, 
), TP_fast_assign( __entry->is_remount = is_remount; - __assign_str(options, options); + __assign_str(options); ), TP_printk("%d %s", __entry->is_remount, __get_str(options)) ); @@ -1718,8 +1718,8 @@ TRACE_EVENT(ocfs2_initialize_super, __field(int, cluster_bits) ), TP_fast_assign( - __assign_str(label, label); - __assign_str(uuid_str, uuid_str); + __assign_str(label); + __assign_str(uuid_str); __entry->root_dir = root_dir; __entry->system_dir = system_dir; __entry->cluster_bits = cluster_bits; @@ -1746,7 +1746,7 @@ TRACE_EVENT(ocfs2_init_xattr_set_ctxt, __field(int, credits) ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->meta = meta; __entry->clusters = clusters; __entry->credits = credits; @@ -1770,7 +1770,7 @@ DECLARE_EVENT_CLASS(ocfs2__xattr_find, ), TP_fast_assign( __entry->ino = ino; - __assign_str(name, name); + __assign_str(name); __entry->name_index = name_index; __entry->hash = hash; __entry->location = location; @@ -2019,7 +2019,7 @@ TRACE_EVENT(ocfs2_sync_dquot_helper, __entry->dq_id = dq_id; __entry->dq_type = dq_type; __entry->type = type; - __assign_str(s_id, s_id); + __assign_str(s_id); ), TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type, __entry->type, __get_str(s_id)) @@ -2060,7 +2060,7 @@ TRACE_EVENT(ocfs2_dx_dir_search, TP_fast_assign( __entry->ino = ino; __entry->namelen = namelen; - __assign_str(name, name); + __assign_str(name); __entry->major_hash = major_hash; __entry->minor_hash = minor_hash; __entry->blkno = blkno; @@ -2088,7 +2088,7 @@ TRACE_EVENT(ocfs2_find_files_on_disk, ), TP_fast_assign( __entry->namelen = namelen; - __assign_str(name, name); + __assign_str(name); __entry->blkno = blkno; __entry->dir = dir; ), @@ -2107,7 +2107,7 @@ TRACE_EVENT(ocfs2_check_dir_for_entry, TP_fast_assign( __entry->dir = dir; __entry->namelen = namelen; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%llu %.*s", __entry->dir, __entry->namelen, __get_str(name)) @@ -2135,7 +2135,7 @@ 
TRACE_EVENT(ocfs2_dx_dir_index_root_block, __entry->major_hash = major_hash; __entry->minor_hash = minor_hash; __entry->namelen = namelen; - __assign_str(name, name); + __assign_str(name); __entry->num_used = num_used; ), TP_printk("%llu %x %x %.*s %u", __entry->dir, @@ -2171,7 +2171,7 @@ DECLARE_EVENT_CLASS(ocfs2__dentry_ops, __entry->dir = dir; __entry->dentry = dentry; __entry->name_len = name_len; - __assign_str(name, name); + __assign_str(name); __entry->dir_blkno = dir_blkno; __entry->extra = extra; ), @@ -2217,7 +2217,7 @@ TRACE_EVENT(ocfs2_mknod, __entry->dir = dir; __entry->dentry = dentry; __entry->name_len = name_len; - __assign_str(name, name); + __assign_str(name); __entry->dir_blkno = dir_blkno; __entry->dev = dev; __entry->mode = mode; @@ -2241,9 +2241,9 @@ TRACE_EVENT(ocfs2_link, TP_fast_assign( __entry->ino = ino; __entry->old_len = old_len; - __assign_str(old_name, old_name); + __assign_str(old_name); __entry->name_len = name_len; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%llu %.*s %.*s", __entry->ino, __entry->old_len, __get_str(old_name), @@ -2279,9 +2279,9 @@ TRACE_EVENT(ocfs2_rename, __entry->new_dir = new_dir; __entry->new_dentry = new_dentry; __entry->old_len = old_len; - __assign_str(old_name, old_name); + __assign_str(old_name); __entry->new_len = new_len; - __assign_str(new_name, new_name); + __assign_str(new_name); ), TP_printk("%p %p %p %p %.*s %.*s", __entry->old_dir, __entry->old_dentry, @@ -2301,7 +2301,7 @@ TRACE_EVENT(ocfs2_rename_target_exists, ), TP_fast_assign( __entry->new_len = new_len; - __assign_str(new_name, new_name); + __assign_str(new_name); ), TP_printk("%.*s", __entry->new_len, __get_str(new_name)) ); @@ -2344,7 +2344,7 @@ TRACE_EVENT(ocfs2_symlink_begin, __entry->dentry = dentry; __entry->symname = symname; __entry->len = len; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry, __entry->symname, __entry->len, __get_str(name)) @@ -2360,7 
+2360,7 @@ TRACE_EVENT(ocfs2_blkno_stringify, ), TP_fast_assign( __entry->blkno = blkno; - __assign_str(name, name); + __assign_str(name); __entry->namelen = namelen; ), TP_printk("%llu %s %d", __entry->blkno, __get_str(name), @@ -2381,7 +2381,7 @@ TRACE_EVENT(ocfs2_orphan_del, ), TP_fast_assign( __entry->dir = dir; - __assign_str(name, name); + __assign_str(name); __entry->namelen = namelen; ), TP_printk("%llu %s %d", __entry->dir, __get_str(name), @@ -2403,7 +2403,7 @@ TRACE_EVENT(ocfs2_dentry_revalidate, TP_fast_assign( __entry->dentry = dentry; __entry->len = len; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name)) ); @@ -2420,7 +2420,7 @@ TRACE_EVENT(ocfs2_dentry_revalidate_negative, ), TP_fast_assign( __entry->len = len; - __assign_str(name, name); + __assign_str(name); __entry->pgen = pgen; __entry->gen = gen; ), @@ -2445,7 +2445,7 @@ TRACE_EVENT(ocfs2_find_local_alias, ), TP_fast_assign( __entry->len = len; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%.*s", __entry->len, __get_str(name)) ); @@ -2462,7 +2462,7 @@ TRACE_EVENT(ocfs2_dentry_attach_lock, ), TP_fast_assign( __entry->len = len; - __assign_str(name, name); + __assign_str(name); __entry->parent = parent; __entry->fsdata = fsdata; ), @@ -2480,7 +2480,7 @@ TRACE_EVENT(ocfs2_dentry_attach_lock_found, __field(unsigned long long, ino) ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->parent = parent; __entry->ino = ino; ), @@ -2527,7 +2527,7 @@ TRACE_EVENT(ocfs2_get_parent, TP_fast_assign( __entry->child = child; __entry->len = len; - __assign_str(name, name); + __assign_str(name); __entry->ino = ino; ), TP_printk("%p %.*s %llu", __entry->child, __entry->len, @@ -2551,7 +2551,7 @@ TRACE_EVENT(ocfs2_encode_fh_begin, TP_fast_assign( __entry->dentry = dentry; __entry->name_len = name_len; - __assign_str(name, name); + __assign_str(name); __entry->fh = fh; __entry->len = len; 
__entry->connectable = connectable; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 3f80a56d0d60..1f303b1adf1a 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -630,7 +630,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, rb->rf_records.rl_count = cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb)); spin_lock(&osb->osb_lock); - rb->rf_generation = osb->s_next_generation++; + rb->rf_generation = cpu_to_le32(osb->s_next_generation++); spin_unlock(&osb->osb_lock); ocfs2_journal_dirty(handle, new_bh); diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index a9d1296d736d..1fe61974d9f0 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -414,7 +414,7 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap, start = search_start; while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len, - start)) != -1) { + start)) < resmap->m_bitmap_len) { /* Search reached end of the region */ if (offset >= (search_start + search_len)) break; diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index d65d43c61857..c4a4016d3866 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -91,6 +91,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, u16 cl_bpc = le16_to_cpu(cl->cl_bpc); u16 cl_cpg = le16_to_cpu(cl->cl_cpg); u16 old_bg_clusters; + u16 contig_bits; + __le16 old_bg_contig_free_bits; trace_ocfs2_update_last_group_and_inode(new_clusters, first_new_cluster); @@ -122,6 +124,11 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, le16_add_cpu(&group->bg_free_bits_count, -1 * backups); } + contig_bits = ocfs2_find_max_contig_free_bits(group->bg_bitmap, + le16_to_cpu(group->bg_bits), 0); + old_bg_contig_free_bits = group->bg_contig_free_bits; + group->bg_contig_free_bits = cpu_to_le16(contig_bits); + ocfs2_journal_dirty(handle, group_bh); /* update the inode accordingly. 
*/ @@ -160,6 +167,7 @@ out_rollback: le16_add_cpu(&group->bg_free_bits_count, backups); le16_add_cpu(&group->bg_bits, -1 * num_bits); le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits); + group->bg_contig_free_bits = old_bg_contig_free_bits; } out: if (ret) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 166c8918c825..f7b483f0de2a 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -50,6 +50,10 @@ struct ocfs2_suballoc_result { u64 sr_blkno; /* The first allocated block */ unsigned int sr_bit_offset; /* The bit in the bg */ unsigned int sr_bits; /* How many bits we claimed */ + unsigned int sr_max_contig_bits; /* The length for contiguous + * free bits, only available + * for cluster group + */ }; static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) @@ -1272,6 +1276,26 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, return ret; } +u16 ocfs2_find_max_contig_free_bits(void *bitmap, + u16 total_bits, u16 start) +{ + u16 offset, free_bits; + u16 contig_bits = 0; + + while (start < total_bits) { + offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start); + if (offset == total_bits) + break; + + start = ocfs2_find_next_bit(bitmap, total_bits, offset); + free_bits = start - offset; + if (contig_bits < free_bits) + contig_bits = free_bits; + } + + return contig_bits; +} + static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, struct buffer_head *bg_bh, unsigned int bits_wanted, @@ -1280,6 +1304,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, { void *bitmap; u16 best_offset, best_size; + u16 prev_best_size = 0; int offset, start, found, status = 0; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; @@ -1290,10 +1315,8 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, found = start = best_offset = best_size = 0; bitmap = bg->bg_bitmap; - while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) { - if 
(offset == total_bits) - break; - + while ((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) < + total_bits) { if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { /* We found a zero, but we can't use it as it * hasn't been put to disk yet! */ @@ -1308,6 +1331,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, /* got a zero after some ones */ found = 1; start = offset + 1; + prev_best_size = best_size; } if (found > best_size) { best_size = found; @@ -1320,6 +1344,8 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, } } + /* best_size will be allocated, we save prev_best_size */ + res->sr_max_contig_bits = prev_best_size; if (best_size) { res->sr_bit_offset = best_offset; res->sr_bits = best_size; @@ -1337,11 +1363,16 @@ int ocfs2_block_group_set_bits(handle_t *handle, struct ocfs2_group_desc *bg, struct buffer_head *group_bh, unsigned int bit_off, - unsigned int num_bits) + unsigned int num_bits, + unsigned int max_contig_bits, + int fastpath) { int status; void *bitmap = bg->bg_bitmap; int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; + unsigned int start = bit_off + num_bits; + u16 contig_bits; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); /* All callers get the descriptor via * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ @@ -1373,6 +1404,29 @@ int ocfs2_block_group_set_bits(handle_t *handle, while(num_bits--) ocfs2_set_bit(bit_off++, bitmap); + /* + * this is optimize path, caller set old contig value + * in max_contig_bits to bypass finding action. + */ + if (fastpath) { + bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); + } else if (ocfs2_is_cluster_bitmap(alloc_inode)) { + /* + * Usually, the block group bitmap allocates only 1 bit + * at a time, while the cluster group allocates n bits + * each time. Therefore, we only save the contig bits for + * the cluster group. 
+ */ + contig_bits = ocfs2_find_max_contig_free_bits(bitmap, + le16_to_cpu(bg->bg_bits), start); + if (contig_bits > max_contig_bits) + max_contig_bits = contig_bits; + bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); + ocfs2_local_alloc_seen_free_bits(osb, max_contig_bits); + } else { + bg->bg_contig_free_bits = 0; + } + ocfs2_journal_dirty(handle, group_bh); bail: @@ -1486,7 +1540,12 @@ static int ocfs2_cluster_group_search(struct inode *inode, BUG_ON(!ocfs2_is_cluster_bitmap(inode)); - if (gd->bg_free_bits_count) { + if (le16_to_cpu(gd->bg_contig_free_bits) && + le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted) + return -ENOSPC; + + /* ->bg_contig_free_bits may un-initialized, so compare again */ + if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) { max_bits = le16_to_cpu(gd->bg_bits); /* Tail groups in cluster bitmaps which aren't cpg @@ -1530,13 +1589,6 @@ static int ocfs2_cluster_group_search(struct inode *inode, * of bits. */ if (min_bits <= res->sr_bits) search = 0; /* success */ - else if (res->sr_bits) { - /* - * Don't show bits which we'll be returning - * for allocation to the local alloc bitmap. 
- */ - ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits); - } } return search; @@ -1555,7 +1607,7 @@ static int ocfs2_block_group_search(struct inode *inode, BUG_ON(min_bits != 1); BUG_ON(ocfs2_is_cluster_bitmap(inode)); - if (bg->bg_free_bits_count) { + if (le16_to_cpu(bg->bg_free_bits_count) >= bits_wanted) { ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, le16_to_cpu(bg->bg_bits), @@ -1715,7 +1767,8 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, } ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, - res->sr_bit_offset, res->sr_bits); + res->sr_bit_offset, res->sr_bits, + res->sr_max_contig_bits, 0); if (ret < 0) { ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, res->sr_bits, @@ -1849,7 +1902,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, bg, group_bh, res->sr_bit_offset, - res->sr_bits); + res->sr_bits, + res->sr_max_contig_bits, + 0); if (status < 0) { ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, res->sr_bits, chain); @@ -1951,7 +2006,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { if (i == victim) continue; - if (!cl->cl_recs[i].c_free) + if (le32_to_cpu(cl->cl_recs[i].c_free) < bits_wanted) continue; ac->ac_chain = i; @@ -2163,7 +2218,9 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle, bg, bg_bh, res->sr_bit_offset, - res->sr_bits); + res->sr_bits, + res->sr_max_contig_bits, + 0); if (ret < 0) { ocfs2_rollback_alloc_dinode_counts(ac->ac_inode, ac->ac_bh, res->sr_bits, chain); @@ -2382,11 +2439,13 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, struct buffer_head *group_bh, unsigned int bit_off, unsigned int num_bits, + unsigned int max_contig_bits, void (*undo_fn)(unsigned int bit, unsigned long *bmap)) { int status; unsigned int tmp; + u16 contig_bits; struct ocfs2_group_desc *undo_bg = NULL; struct journal_head *jh; @@ -2433,6 +2492,20 
@@ static int ocfs2_block_group_clear_bits(handle_t *handle, num_bits); } + /* + * TODO: even 'num_bits == 1' (the worst case, release 1 cluster), + * we still need to rescan whole bitmap. + */ + if (ocfs2_is_cluster_bitmap(alloc_inode)) { + contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap, + le16_to_cpu(bg->bg_bits), 0); + if (contig_bits > max_contig_bits) + max_contig_bits = contig_bits; + bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); + } else { + bg->bg_contig_free_bits = 0; + } + if (undo_fn) spin_unlock(&jh->b_state_lock); @@ -2459,6 +2532,7 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, struct ocfs2_chain_list *cl = &fe->id2.i_chain; struct buffer_head *group_bh = NULL; struct ocfs2_group_desc *group; + __le16 old_bg_contig_free_bits = 0; /* The alloc_bh comes from ocfs2_free_dinode() or * ocfs2_free_clusters(). The callers have all locked the @@ -2483,9 +2557,11 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); + if (ocfs2_is_cluster_bitmap(alloc_inode)) + old_bg_contig_free_bits = group->bg_contig_free_bits; status = ocfs2_block_group_clear_bits(handle, alloc_inode, group, group_bh, - start_bit, count, undo_fn); + start_bit, count, 0, undo_fn); if (status < 0) { mlog_errno(status); goto bail; @@ -2496,7 +2572,8 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, if (status < 0) { mlog_errno(status); ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh, - start_bit, count); + start_bit, count, + le16_to_cpu(old_bg_contig_free_bits), 1); goto bail; } diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 9c74eace3adc..b481b834857d 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -79,12 +79,16 @@ void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, struct buffer_head *di_bh, u32 num_bits, u16 chain); +u16 ocfs2_find_max_contig_free_bits(void *bitmap, + u16 total_bits, u16 start); int ocfs2_block_group_set_bits(handle_t 
*handle, struct inode *alloc_inode, struct ocfs2_group_desc *bg, struct buffer_head *group_bh, unsigned int bit_off, - unsigned int num_bits); + unsigned int num_bits, + unsigned int max_contig_bits, + int fastpath); int ocfs2_claim_metadata(handle_t *handle, struct ocfs2_alloc_context *ac, diff --git a/fs/open.c b/fs/open.c index ee8460c83c77..89cafb572061 100644 --- a/fs/open.c +++ b/fs/open.c @@ -902,10 +902,10 @@ cleanup_inode: } static int do_dentry_open(struct file *f, - struct inode *inode, int (*open)(struct inode *, struct file *)) { static const struct file_operations empty_fops = {}; + struct inode *inode = f->f_path.dentry->d_inode; int error; path_get(&f->f_path); @@ -1047,7 +1047,7 @@ int finish_open(struct file *file, struct dentry *dentry, BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ file->f_path.dentry = dentry; - return do_dentry_open(file, d_backing_inode(dentry), open); + return do_dentry_open(file, open); } EXPORT_SYMBOL(finish_open); @@ -1086,7 +1086,7 @@ EXPORT_SYMBOL(file_path); int vfs_open(const struct path *path, struct file *file) { file->f_path = *path; - return do_dentry_open(file, d_backing_inode(path->dentry), NULL); + return do_dentry_open(file, NULL); } struct file *dentry_open(const struct path *path, int flags, @@ -1155,7 +1155,6 @@ EXPORT_SYMBOL(dentry_create); * kernel_file_open - open a file for kernel internal use * @path: path of the file to open * @flags: open flags - * @inode: the inode * @cred: credentials for open * * Open a file for use by in-kernel consumers. The file is not accounted @@ -1165,7 +1164,7 @@ EXPORT_SYMBOL(dentry_create); * Return: Opened file on success, an error pointer on failure. 
*/ struct file *kernel_file_open(const struct path *path, int flags, - struct inode *inode, const struct cred *cred) + const struct cred *cred) { struct file *f; int error; @@ -1175,7 +1174,7 @@ struct file *kernel_file_open(const struct path *path, int flags, return f; f->f_path = *path; - error = do_dentry_open(f, inode, NULL); + error = do_dentry_open(f, NULL); if (error) { fput(f); f = ERR_PTR(error); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 0f8b4a719237..116f542442dd 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -14,6 +14,7 @@ #include <linux/posix_acl_xattr.h> #include <linux/atomic.h> #include <linux/ratelimit.h> +#include <linux/backing-file.h> #include "overlayfs.h" static unsigned short ovl_redirect_max = 256; @@ -260,14 +261,13 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) * may not use to instantiate the new dentry. */ static int ovl_instantiate(struct dentry *dentry, struct inode *inode, - struct dentry *newdentry, bool hardlink) + struct dentry *newdentry, bool hardlink, struct file *tmpfile) { struct ovl_inode_params oip = { .upperdentry = newdentry, .newinode = inode, }; - ovl_dir_modified(dentry->d_parent, false); ovl_dentry_set_upper_alias(dentry); ovl_dentry_init_reval(dentry, newdentry, NULL); @@ -295,6 +295,9 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode, inc_nlink(inode); } + if (tmpfile) + d_mark_tmpfile(tmpfile, inode); + d_instantiate(dentry, inode); if (inode != oip.newinode) { pr_warn_ratelimited("newly created inode found in cache (%pd2)\n", @@ -327,9 +330,6 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct dentry *newdentry; int err; - if (!attr->hardlink && !IS_POSIXACL(udir)) - attr->mode &= ~current_umask(); - inode_lock_nested(udir, I_MUTEX_PARENT); newdentry = ovl_create_real(ofs, udir, ovl_lookup_upper(ofs, dentry->d_name.name, @@ -345,7 +345,8 @@ static int ovl_create_upper(struct dentry *dentry, struct 
inode *inode, ovl_set_opaque(dentry, newdentry); } - err = ovl_instantiate(dentry, inode, newdentry, !!attr->hardlink); + ovl_dir_modified(dentry->d_parent, false); + err = ovl_instantiate(dentry, inode, newdentry, !!attr->hardlink, NULL); if (err) goto out_cleanup; out_unlock: @@ -529,7 +530,8 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out_cleanup; } - err = ovl_instantiate(dentry, inode, newdentry, hardlink); + ovl_dir_modified(dentry->d_parent, false); + err = ovl_instantiate(dentry, inode, newdentry, hardlink, NULL); if (err) { ovl_cleanup(ofs, udir, newdentry); dput(newdentry); @@ -551,12 +553,35 @@ out_cleanup: goto out_dput; } +static int ovl_setup_cred_for_create(struct dentry *dentry, struct inode *inode, + umode_t mode, const struct cred *old_cred) +{ + int err; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) + return -ENOMEM; + + override_cred->fsuid = inode->i_uid; + override_cred->fsgid = inode->i_gid; + err = security_dentry_create_files_as(dentry, mode, &dentry->d_name, + old_cred, override_cred); + if (err) { + put_cred(override_cred); + return err; + } + put_cred(override_creds(override_cred)); + put_cred(override_cred); + + return 0; +} + static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, struct ovl_cattr *attr, bool origin) { int err; const struct cred *old_cred; - struct cred *override_cred; struct dentry *parent = dentry->d_parent; old_cred = ovl_override_creds(dentry->d_sb); @@ -572,10 +597,6 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, } if (!attr->hardlink) { - err = -ENOMEM; - override_cred = prepare_creds(); - if (!override_cred) - goto out_revert_creds; /* * In the creation cases(create, mkdir, mknod, symlink), * ovl should transfer current's fs{u,g}id to underlying @@ -589,17 +610,9 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, * create a new inode, so just 
use the ovl mounter's * fs{u,g}id. */ - override_cred->fsuid = inode->i_uid; - override_cred->fsgid = inode->i_gid; - err = security_dentry_create_files_as(dentry, - attr->mode, &dentry->d_name, old_cred, - override_cred); - if (err) { - put_cred(override_cred); + err = ovl_setup_cred_for_create(dentry, inode, attr->mode, old_cred); + if (err) goto out_revert_creds; - } - put_cred(override_creds(override_cred)); - put_cred(override_cred); } if (!ovl_dentry_is_whiteout(dentry)) @@ -1290,6 +1303,100 @@ out: return err; } +static int ovl_create_tmpfile(struct file *file, struct dentry *dentry, + struct inode *inode, umode_t mode) +{ + const struct cred *old_cred; + struct path realparentpath; + struct file *realfile; + struct dentry *newdentry; + /* It's okay to set O_NOATIME, since the owner will be current fsuid */ + int flags = file->f_flags | OVL_OPEN_FLAGS; + int err; + + err = ovl_copy_up(dentry->d_parent); + if (err) + return err; + + old_cred = ovl_override_creds(dentry->d_sb); + err = ovl_setup_cred_for_create(dentry, inode, mode, old_cred); + if (err) + goto out_revert_creds; + + ovl_path_upper(dentry->d_parent, &realparentpath); + realfile = backing_tmpfile_open(&file->f_path, flags, &realparentpath, + mode, current_cred()); + err = PTR_ERR_OR_ZERO(realfile); + pr_debug("tmpfile/open(%pd2, 0%o) = %i\n", realparentpath.dentry, mode, err); + if (err) + goto out_revert_creds; + + /* ovl_instantiate() consumes the newdentry reference on success */ + newdentry = dget(realfile->f_path.dentry); + err = ovl_instantiate(dentry, inode, newdentry, false, file); + if (!err) { + file->private_data = realfile; + } else { + dput(newdentry); + fput(realfile); + } +out_revert_creds: + revert_creds(old_cred); + return err; +} + +static int ovl_dummy_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static int ovl_tmpfile(struct mnt_idmap *idmap, struct inode *dir, + struct file *file, umode_t mode) +{ + int err; + struct dentry *dentry = file->f_path.dentry; 
+ struct inode *inode; + + if (!OVL_FS(dentry->d_sb)->tmpfile) + return -EOPNOTSUPP; + + err = ovl_want_write(dentry); + if (err) + return err; + + err = -ENOMEM; + inode = ovl_new_inode(dentry->d_sb, mode, 0); + if (!inode) + goto drop_write; + + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); + err = ovl_create_tmpfile(file, dentry, inode, inode->i_mode); + if (err) + goto put_inode; + + /* + * Check if the preallocated inode was actually used. Having something + * else assigned to the dentry shouldn't happen as that would indicate + * that the backing tmpfile "leaked" out of overlayfs. + */ + err = -EIO; + if (WARN_ON(inode != d_inode(dentry))) + goto put_realfile; + + /* inode reference was transferred to dentry */ + inode = NULL; + err = finish_open(file, dentry, ovl_dummy_open); +put_realfile: + /* Without FMODE_OPENED ->release() won't be called on @file */ + if (!(file->f_mode & FMODE_OPENED)) + fput(file->private_data); +put_inode: + iput(inode); +drop_write: + ovl_drop_write(dentry); + return err; +} + const struct inode_operations ovl_dir_inode_operations = { .lookup = ovl_lookup, .mkdir = ovl_mkdir, @@ -1310,4 +1417,5 @@ const struct inode_operations ovl_dir_inode_operations = { .update_time = ovl_update_time, .fileattr_get = ovl_fileattr_get, .fileattr_set = ovl_fileattr_set, + .tmpfile = ovl_tmpfile, }; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 05536964d37f..1a411cae57ed 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -24,9 +24,6 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode) return 'm'; } -/* No atime modification on underlying */ -#define OVL_OPEN_FLAGS (O_NOATIME) - static struct file *ovl_open_realfile(const struct file *file, const struct path *realpath) { diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index c63b31a460be..35fd3e3e1778 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -8,7 +8,6 @@ #include <linux/slab.h> #include <linux/cred.h> #include 
<linux/xattr.h> -#include <linux/posix_acl.h> #include <linux/ratelimit.h> #include <linux/fiemap.h> #include <linux/fileattr.h> diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index ee949f3e7c77..0bfe35da4b7b 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -175,6 +175,9 @@ static inline int ovl_metadata_digest_size(const struct ovl_metacopy *metacopy) return (int)metacopy->len - OVL_METACOPY_MIN_SIZE; } +/* No atime modification on underlying */ +#define OVL_OPEN_FLAGS (O_NOATIME) + extern const char *const ovl_xattr_table[][2]; static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox) { diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index d285d1d7baad..edc9216f6e27 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1376,7 +1376,7 @@ int ovl_ensure_verity_loaded(struct path *datapath) * If this inode was not yet opened, the verity info hasn't been * loaded yet, so we need to do that here to force it into memory. */ - filp = kernel_file_open(datapath, O_RDONLY, inode, current_cred()); + filp = kernel_file_open(datapath, O_RDONLY, current_cred()); if (IS_ERR(filp)) return PTR_ERR(filp); fput(filp); diff --git a/fs/pidfs.c b/fs/pidfs.c index a63d5d24aa02..dbb9d854d1c5 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -169,6 +169,24 @@ static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, return -EOPNOTSUPP; } + +/* + * User space expects pidfs inodes to have no file type in st_mode. + * + * In particular, 'lsof' has this legacy logic: + * + * type = s->st_mode & S_IFMT; + * switch (type) { + * ... + * case 0: + * if (!strcmp(p, "anon_inode")) + * Lf->ntype = Ntype = N_ANON_INODE; + * + * to detect our old anon_inode logic. + * + * Rather than mess with our internal sane inode data, just fix it + * up here in getattr() by masking off the format bits. 
+ */ static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) @@ -176,6 +194,7 @@ static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct inode *inode = d_inode(path->dentry); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + stat->mode &= ~S_IFMT; return 0; } @@ -199,12 +218,13 @@ static const struct super_operations pidfs_sops = { .statfs = simple_statfs, }; +/* + * 'lsof' has knowledge of out historical anon_inode use, and expects + * the pidfs dentry name to start with 'anon_inode'. + */ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) { - struct inode *inode = d_inode(dentry); - struct pid *pid = inode->i_private; - - return dynamic_dname(buffer, buflen, "pidfd:[%llu]", pid->ino); + return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]"); } static const struct dentry_operations pidfs_dentry_operations = { diff --git a/fs/proc/fd.c b/fs/proc/fd.c index f4b1c8b42a51..586bbc84ca04 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -39,10 +39,8 @@ static int seq_show(struct seq_file *m, void *v) spin_lock(&files->file_lock); file = files_lookup_fd_locked(files, fd); if (file) { - struct fdtable *fdt = files_fdtable(files); - f_flags = file->f_flags; - if (close_on_exec(fd, fdt)) + if (close_on_exec(fd, files)) f_flags |= O_CLOEXEC; get_file(file); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index dcd513dccf55..d19434e2a58e 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -451,15 +451,13 @@ pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned lo unsigned long len, unsigned long pgoff, unsigned long flags) { - typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; + if (pde->proc_ops->proc_get_unmapped_area) + return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags); - get_area = pde->proc_ops->proc_get_unmapped_area; #ifdef CONFIG_MMU - if (!get_area) - 
get_area = current->mm->get_unmapped_area; + return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags); #endif - if (get_area) - return get_area(file, orig_addr, len, pgoff, flags); + return orig_addr; } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 45af9a989d40..245171d9164b 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -89,8 +89,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SwapTotal: ", i.totalswap); show_val_kb(m, "SwapFree: ", i.freeswap); #ifdef CONFIG_ZSWAP - seq_printf(m, "Zswap: %8lu kB\n", - (unsigned long)(zswap_pool_total_size >> 10)); + show_val_kb(m, "Zswap: ", zswap_total_pages()); seq_printf(m, "Zswapped: %8lu kB\n", (unsigned long)atomic_read(&zswap_stored_pages) << (PAGE_SHIFT - 10)); diff --git a/fs/proc/page.c b/fs/proc/page.c index 9223856c934b..2fb64bdb64eb 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -107,10 +107,13 @@ static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) return ((kflags >> kbit) & 1) << ubit; } -u64 stable_page_flags(struct page *page) +u64 stable_page_flags(const struct page *page) { - u64 k; - u64 u; + const struct folio *folio; + unsigned long k; + unsigned long mapping; + bool is_anon; + u64 u = 0; /* * pseudo flag: KPF_NOPAGE @@ -118,49 +121,47 @@ u64 stable_page_flags(struct page *page) */ if (!page) return 1 << KPF_NOPAGE; + folio = page_folio(page); - k = page->flags; - u = 0; + k = folio->flags; + mapping = (unsigned long)folio->mapping; + is_anon = mapping & PAGE_MAPPING_ANON; /* * pseudo flags for the well known (anonymous) memory mapped pages */ if (page_mapped(page)) u |= 1 << KPF_MMAP; - if (PageAnon(page)) + if (is_anon) { u |= 1 << KPF_ANON; - if (PageKsm(page)) - u |= 1 << KPF_KSM; + if (mapping & PAGE_MAPPING_KSM) + u |= 1 << KPF_KSM; + } /* * compound pages: export both head/tail info * they together define a compound page's start/end pos and order */ - if (PageHead(page)) - u |= 1 << KPF_COMPOUND_HEAD; - if 
(PageTail(page)) + if (page == &folio->page) + u |= kpf_copy_bit(k, KPF_COMPOUND_HEAD, PG_head); + else u |= 1 << KPF_COMPOUND_TAIL; - if (PageHuge(page)) + if (folio_test_hugetlb(folio)) u |= 1 << KPF_HUGE; /* - * PageTransCompound can be true for non-huge compound pages (slab - * pages or pages allocated by drivers with __GFP_COMP) because it - * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon + * We need to check PageLRU/PageAnon * to make sure a given page is a thp, not a non-huge compound page. */ - else if (PageTransCompound(page)) { - struct page *head = compound_head(page); - - if (PageLRU(head) || PageAnon(head)) + else if (folio_test_large(folio)) { + if ((k & (1 << PG_lru)) || is_anon) u |= 1 << KPF_THP; - else if (is_huge_zero_page(head)) { + else if (is_huge_zero_folio(folio)) { u |= 1 << KPF_ZERO_PAGE; u |= 1 << KPF_THP; } } else if (is_zero_pfn(page_to_pfn(page))) u |= 1 << KPF_ZERO_PAGE; - /* * Caveats on high order pages: PG_buddy and PG_slab will only be set * on the head page. 
@@ -174,16 +175,17 @@ u64 stable_page_flags(struct page *page) u |= 1 << KPF_OFFLINE; if (PageTable(page)) u |= 1 << KPF_PGTABLE; + if (folio_test_slab(folio)) + u |= 1 << KPF_SLAB; - if (page_is_idle(page)) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) + u |= kpf_copy_bit(k, KPF_IDLE, PG_idle); +#else + if (folio_test_idle(folio)) u |= 1 << KPF_IDLE; +#endif u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); - - u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); - if (PageTail(page) && PageSlab(page)) - u |= 1 << KPF_SLAB; - u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate); @@ -194,7 +196,8 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active); u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); - if (PageSwapCache(page)) +#define SWAPCACHE ((1 << PG_swapbacked) | (1 << PG_swapcache)) + if ((k & SWAPCACHE) == SWAPCACHE) u |= 1 << KPF_SWAPCACHE; u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); @@ -202,7 +205,10 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); #ifdef CONFIG_MEMORY_FAILURE - u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); + if (u & (1 << KPF_HUGE)) + u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); + else + u |= kpf_copy_bit(page->flags, KPF_HWPOISON, PG_hwpoison); #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED @@ -228,7 +234,6 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, { const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; - struct page *ppage; unsigned long src = *ppos; unsigned long pfn; ssize_t ret = 0; @@ -245,9 +250,9 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, * TODO: ZONE_DEVICE support requires to identify * memmaps that were actually initialized. 
*/ - ppage = pfn_to_online_page(pfn); + struct page *page = pfn_to_online_page(pfn); - if (put_user(stable_page_flags(ppage), out)) { + if (put_user(stable_page_flags(page), out)) { ret = -EFAULT; break; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 102f48668c35..e5a5f015ff03 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -411,14 +411,14 @@ struct mem_size_stats { }; static void smaps_page_accumulate(struct mem_size_stats *mss, - struct page *page, unsigned long size, unsigned long pss, + struct folio *folio, unsigned long size, unsigned long pss, bool dirty, bool locked, bool private) { mss->pss += pss; - if (PageAnon(page)) + if (folio_test_anon(folio)) mss->pss_anon += pss; - else if (PageSwapBacked(page)) + else if (folio_test_swapbacked(folio)) mss->pss_shmem += pss; else mss->pss_file += pss; @@ -426,7 +426,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, if (locked) mss->pss_locked += pss; - if (dirty || PageDirty(page)) { + if (dirty || folio_test_dirty(folio)) { mss->pss_dirty += pss; if (private) mss->private_dirty += size; @@ -444,6 +444,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty, bool locked, bool migration) { + struct folio *folio = page_folio(page); int i, nr = compound ? compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; @@ -451,27 +452,28 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, * First accumulate quantities that depend only on |size| and the type * of the compound page. */ - if (PageAnon(page)) { + if (folio_test_anon(folio)) { mss->anonymous += size; - if (!PageSwapBacked(page) && !dirty && !PageDirty(page)) + if (!folio_test_swapbacked(folio) && !dirty && + !folio_test_dirty(folio)) mss->lazyfree += size; } - if (PageKsm(page)) + if (folio_test_ksm(folio)) mss->ksm += size; mss->resident += size; /* Accumulate the size in pages that have been accessed. 
*/ - if (young || page_is_young(page) || PageReferenced(page)) + if (young || folio_test_young(folio) || folio_test_referenced(folio)) mss->referenced += size; /* * Then accumulate quantities that may depend on sharing, or that may * differ page-by-page. * - * page_count(page) == 1 guarantees the page is mapped exactly once. + * refcount == 1 guarantees the page is mapped exactly once. * If any subpage of the compound page mapped with PTE it would elevate - * page_count(). + * the refcount. * * The page_mapcount() is called to get a snapshot of the mapcount. * Without holding the page lock this snapshot can be slightly wrong as @@ -480,9 +482,9 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, * especially for migration entries. Treat regular migration entries * as mapcount == 1. */ - if ((page_count(page) == 1) || migration) { - smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty, - locked, true); + if ((folio_ref_count(folio) == 1) || migration) { + smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT, + dirty, locked, true); return; } for (i = 0; i < nr; i++, page++) { @@ -490,8 +492,8 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, unsigned long pss = PAGE_SIZE << PSS_SHIFT; if (mapcount >= 2) pss /= mapcount; - smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked, - mapcount < 2); + smaps_page_accumulate(mss, folio, PAGE_SIZE, pss, + dirty, locked, mapcount < 2); } } @@ -576,6 +578,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; + struct folio *folio; bool migration = false; if (pmd_present(*pmd)) { @@ -590,11 +593,12 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, } if (IS_ERR_OR_NULL(page)) return; - if (PageAnon(page)) + folio = page_folio(page); + if (folio_test_anon(folio)) mss->anonymous_thp += HPAGE_PMD_SIZE; - else if 
(PageSwapBacked(page)) + else if (folio_test_swapbacked(folio)) mss->shmem_thp += HPAGE_PMD_SIZE; - else if (is_zone_device_page(page)) + else if (folio_is_zone_device(folio)) /* pass */; else mss->file_thp += HPAGE_PMD_SIZE; @@ -726,19 +730,20 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; - struct page *page = NULL; - pte_t ptent = ptep_get(pte); + pte_t ptent = huge_ptep_get(pte); + struct folio *folio = NULL; if (pte_present(ptent)) { - page = vm_normal_page(vma, addr, ptent); + folio = page_folio(pte_page(ptent)); } else if (is_swap_pte(ptent)) { swp_entry_t swpent = pte_to_swp_entry(ptent); if (is_pfn_swap_entry(swpent)) - page = pfn_swap_entry_to_page(swpent); + folio = pfn_swap_entry_folio(swpent); } - if (page) { - if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte)) + if (folio) { + if (folio_likely_mapped_shared(folio) || + hugetlb_pmd_shared(pte)) mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); else mss->private_hugetlb += huge_page_size(hstate_vma(vma)); @@ -866,8 +871,8 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %8u\n", - !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false, - true, THP_ORDERS_ALL)); + !!thp_vma_allowable_orders(vma, vma->vm_flags, + TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); @@ -1161,7 +1166,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; pte_t *pte, ptent; spinlock_t *ptl; - struct page *page; + struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -1173,12 +1178,12 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, if (!pmd_present(*pmd)) goto out; - page = pmd_page(*pmd); + folio = pmd_folio(*pmd); /* Clear accessed and referenced bits. 
*/ pmdp_test_and_clear_young(vma, addr, pmd); - test_and_clear_page_young(page); - ClearPageReferenced(page); + folio_test_clear_young(folio); + folio_clear_referenced(folio); out: spin_unlock(ptl); return 0; @@ -1200,14 +1205,14 @@ out: if (!pte_present(ptent)) continue; - page = vm_normal_page(vma, addr, ptent); - if (!page) + folio = vm_normal_folio(vma, addr, ptent); + if (!folio) continue; /* Clear accessed and referenced bits. */ ptep_test_and_clear_young(vma, addr, pte); - test_and_clear_page_young(page); - ClearPageReferenced(page); + folio_test_clear_young(folio); + folio_clear_referenced(folio); } pte_unmap_unlock(pte - 1, ptl); cond_resched(); @@ -1574,12 +1579,13 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, pte = huge_ptep_get(ptep); if (pte_present(pte)) { - struct page *page = pte_page(pte); + struct folio *folio = page_folio(pte_page(pte)); - if (!PageAnon(page)) + if (!folio_test_anon(folio)) flags |= PM_FILE; - if (page_mapcount(page) == 1) + if (!folio_likely_mapped_shared(folio) && + !hugetlb_pmd_shared(ptep)) flags |= PM_MMAP_EXCLUSIVE; if (huge_pte_uffd_wp(pte)) @@ -2551,28 +2557,29 @@ struct numa_maps_private { static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, unsigned long nr_pages) { + struct folio *folio = page_folio(page); int count = page_mapcount(page); md->pages += nr_pages; - if (pte_dirty || PageDirty(page)) + if (pte_dirty || folio_test_dirty(folio)) md->dirty += nr_pages; - if (PageSwapCache(page)) + if (folio_test_swapcache(folio)) md->swapcache += nr_pages; - if (PageActive(page) || PageUnevictable(page)) + if (folio_test_active(folio) || folio_test_unevictable(folio)) md->active += nr_pages; - if (PageWriteback(page)) + if (folio_test_writeback(folio)) md->writeback += nr_pages; - if (PageAnon(page)) + if (folio_test_anon(folio)) md->anon += nr_pages; if (count > md->mapcount_max) md->mapcount_max = count; - md->node[page_to_nid(page)] += nr_pages; + 
md->node[folio_nid(folio)] += nr_pages; } static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 1fb213f379a5..b52d85f8ad59 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -383,6 +383,8 @@ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) /* leave now if filled buffer already */ if (!iov_iter_count(iter)) return acc; + + cond_resched(); } list_for_each_entry(m, &vmcore_list, list) { @@ -1370,9 +1372,8 @@ static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data, vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name); vdd_hdr->n_type = NT_VMCOREDD; - strncpy((char *)vdd_hdr->name, VMCOREDD_NOTE_NAME, - sizeof(vdd_hdr->name)); - memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name)); + strscpy_pad(vdd_hdr->name, VMCOREDD_NOTE_NAME); + strscpy_pad(vdd_hdr->dump_name, data->dump_name); } /** diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index dacbee455c03..627eb2f72ef3 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -410,7 +410,7 @@ static inline int mark_all_dquot_dirty(struct dquot __rcu * const *dquots) if (dquot) /* Even in case of error we have to continue */ ret = mark_dquot_dirty(dquot); - if (!err) + if (!err && ret < 0) err = ret; } return err; @@ -1737,7 +1737,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) if (reserve) goto out_flush_warn; - mark_all_dquot_dirty(dquots); + ret = mark_all_dquot_dirty(dquots); out_flush_warn: srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); @@ -1786,7 +1786,7 @@ int dquot_alloc_inode(struct inode *inode) warn_put_all: spin_unlock(&inode->i_lock); if (ret == 0) - mark_all_dquot_dirty(dquots); + ret = mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); return ret; @@ -1990,7 +1990,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) qsize_t inode_usage = 1; struct dquot __rcu **dquots; 
struct dquot *transfer_from[MAXQUOTAS] = {}; - int cnt, index, ret = 0; + int cnt, index, ret = 0, err; char is_valid[MAXQUOTAS] = {}; struct dquot_warn warn_to[MAXQUOTAS]; struct dquot_warn warn_from_inodes[MAXQUOTAS]; @@ -2087,8 +2087,12 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) * mark_all_dquot_dirty(). */ index = srcu_read_lock(&dquot_srcu); - mark_all_dquot_dirty((struct dquot __rcu **)transfer_from); - mark_all_dquot_dirty((struct dquot __rcu **)transfer_to); + err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_from); + if (err < 0) + ret = err; + err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_to); + if (err < 0) + ret = err; srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn_to); @@ -2098,7 +2102,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (is_valid[cnt]) transfer_to[cnt] = transfer_from[cnt]; - return 0; + return ret; over_quota: /* Back out changes we already did */ for (cnt--; cnt >= 0; cnt--) { @@ -2726,6 +2730,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) struct mem_dqblk *dm = &dquot->dq_dqb; int check_blim = 0, check_ilim = 0; struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; + int ret; if (di->d_fieldmask & ~VFS_QC_MASK) return -EINVAL; @@ -2807,8 +2812,9 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) else set_bit(DQ_FAKE_B, &dquot->dq_flags); spin_unlock(&dquot->dq_dqb_lock); - mark_dquot_dirty(dquot); - + ret = mark_dquot_dirty(dquot); + if (ret < 0) + return ret; return 0; } @@ -3016,11 +3022,10 @@ static int __init dquot_init(void) if (!dquot_hash) panic("Cannot create dquot hash table"); - for (i = 0; i < _DQST_DQSTAT_LAST; i++) { - ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL); - if (ret) - panic("Cannot create dquot stat counters"); - } + ret = percpu_counter_init_many(dqstats.counter, 0, GFP_KERNEL, + _DQST_DQSTAT_LAST); + if 
(ret) + panic("Cannot create dquot stat counters"); /* Find power-of-two hlist_heads which can fit into allocation */ nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index c7a1aa3c882b..b45c7edc3225 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -35,7 +35,7 @@ static unsigned long ramfs_mmu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); } const struct file_operations ramfs_file_operations = { diff --git a/fs/read_write.c b/fs/read_write.c index 2115d1f40bd5..ef6339391351 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -392,7 +392,7 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_DEST, buf, len); - ret = call_read_iter(filp, &kiocb, &iter); + ret = filp->f_op->read_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; @@ -494,7 +494,7 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); - ret = call_write_iter(filp, &kiocb, &iter); + ret = filp->f_op->write_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0 && ppos) *ppos = kiocb.ki_pos; @@ -736,9 +736,9 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, kiocb.ki_pos = (ppos ? 
*ppos : 0); if (type == READ) - ret = call_read_iter(filp, &kiocb, iter); + ret = filp->f_op->read_iter(&kiocb, iter); else - ret = call_write_iter(filp, &kiocb, iter); + ret = filp->f_op->write_iter(&kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; @@ -799,7 +799,7 @@ ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, if (ret < 0) return ret; - ret = call_read_iter(file, iocb, iter); + ret = file->f_op->read_iter(iocb, iter); out: if (ret >= 0) fsnotify_access(file); @@ -860,7 +860,7 @@ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, return ret; kiocb_start_write(iocb); - ret = call_write_iter(file, iocb, iter); + ret = file->f_op->write_iter(iocb, iter); if (ret != -EIOCBQUEUED) kiocb_end_write(iocb); if (ret > 0) @@ -1667,6 +1667,7 @@ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) return 0; } +EXPORT_SYMBOL_GPL(generic_write_check_limits); /* Like generic_write_checks(), but takes size of write instead of iter. */ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) diff --git a/fs/reiserfs/README b/fs/reiserfs/README index e2f7a264e3ff..11e9ecf24b63 100644 --- a/fs/reiserfs/README +++ b/fs/reiserfs/README @@ -102,19 +102,9 @@ that start on a node aligned boundary (there are reasons to want to node align files), and he invented and implemented indirect items and unformatted nodes as the solution. -Konstantin Shvachko, with the help of the Russian version of a VC, -tried to put me in a position where I was forced into giving control -of the project to him. (Fortunately, as the person paying the money -for all salaries from my dayjob I owned all copyrights, and you can't -really force takeovers of sole proprietorships.) This was something -curious, because he never really understood the value of our project, -why we should do what we do, or why innovation was possible in -general, but he was sure that he ought to be controlling it. 
Every -innovation had to be forced past him while he was with us. He added -two years to the time required to complete reiserfs, and was a net -loss for me. Mikhail Gilula was a brilliant innovator who also left -in a destructive way that erased the value of his contributions, and -that he was shown much generosity just makes it more painful. +Konstantin Shvachko was taking part in the early days. + +Mikhail Gilula was a brilliant innovator that has shown much generosity. Grigory Zaigralin was an extremely effective system administrator for our group. diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 1d825459ee6e..c1daedc50f4c 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2503,8 +2503,8 @@ out: * start/recovery path as __block_write_full_folio, along with special * code to handle reiserfs tails. */ -static int reiserfs_write_full_folio(struct folio *folio, - struct writeback_control *wbc) +static int reiserfs_write_folio(struct folio *folio, + struct writeback_control *wbc, void *data) { struct inode *inode = folio->mapping->host; unsigned long end_index = inode->i_size >> PAGE_SHIFT; @@ -2721,12 +2721,11 @@ static int reiserfs_read_folio(struct file *f, struct folio *folio) return block_read_full_folio(folio, reiserfs_get_block); } -static int reiserfs_writepage(struct page *page, struct writeback_control *wbc) +static int reiserfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - struct folio *folio = page_folio(page); - struct inode *inode = folio->mapping->host; - reiserfs_wait_on_write_block(inode->i_sb); - return reiserfs_write_full_folio(folio, wbc); + reiserfs_wait_on_write_block(mapping->host->i_sb); + return write_cache_pages(mapping, wbc, reiserfs_write_folio, NULL); } static void reiserfs_truncate_failed_write(struct inode *inode) @@ -3405,7 +3404,7 @@ out: } const struct address_space_operations reiserfs_address_space_operations = { - .writepage = reiserfs_writepage, + .writepages = 
reiserfs_writepages, .read_folio = reiserfs_read_folio, .readahead = reiserfs_readahead, .release_folio = reiserfs_release_folio, @@ -3415,4 +3414,5 @@ const struct address_space_operations reiserfs_address_space_operations = { .bmap = reiserfs_aop_bmap, .direct_IO = reiserfs_direct_IO, .dirty_folio = reiserfs_dirty_folio, + .migrate_folio = buffer_migrate_folio, }; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index e539ccd39e1e..e477ee0ff35d 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2626,8 +2626,7 @@ static int journal_init_dev(struct super_block *super, MAJOR(jdev), MINOR(jdev), result); return result; } else if (jdev != super->s_dev) - set_blocksize(file_bdev(journal->j_bdev_file), - super->s_blocksize); + set_blocksize(journal->j_bdev_file, super->s_blocksize); return 0; } @@ -2643,7 +2642,7 @@ static int journal_init_dev(struct super_block *super, return result; } - set_blocksize(file_bdev(journal->j_bdev_file), super->s_blocksize); + set_blocksize(journal->j_bdev_file, super->s_blocksize); reiserfs_info(super, "journal_init_dev: journal device: %pg\n", file_bdev(journal->j_bdev_file)); diff --git a/fs/remap_range.c b/fs/remap_range.c index de07f978ce3e..28246dfc8485 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -99,8 +99,7 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, return 0; } -static int remap_verify_area(struct file *file, loff_t pos, loff_t len, - bool write) +int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write) { int mask = write ? 
MAY_WRITE : MAY_READ; loff_t tmp; @@ -118,6 +117,7 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len, return fsnotify_file_area_perm(file, mask, &pos, len); } +EXPORT_SYMBOL_GPL(remap_verify_area); /* * Ensure that we don't remap a partial EOF block in the middle of something diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 5d4b0fd3a59e..262576573eb5 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2465,7 +2465,8 @@ int cifs_revalidate_mapping(struct inode *inode) { int rc; - unsigned long *flags = &CIFS_I(inode)->flags; + struct cifsInodeInfo *cifs_inode = CIFS_I(inode); + unsigned long *flags = &cifs_inode->flags; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); /* swapfiles are not supposed to be shared */ @@ -2482,6 +2483,7 @@ cifs_revalidate_mapping(struct inode *inode) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE) goto skip_invalidate; + cifs_inode->netfs.zero_point = cifs_inode->netfs.remote_i_size; rc = filemap_invalidate_inode(inode, true, 0, LLONG_MAX); if (rc) { cifs_dbg(VFS, "%s: invalidate inode %p failed with rc %d\n", diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index af97389e983e..36d47ce59631 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -518,7 +518,7 @@ DECLARE_EVENT_CLASS(smb3_inf_compound_enter_class, __entry->xid = xid; __entry->tid = tid; __entry->sesid = sesid; - __assign_str(path, full_path); + __assign_str(path); ), TP_printk("xid=%u sid=0x%llx tid=0x%x path=%s", __entry->xid, __entry->sesid, __entry->tid, @@ -762,7 +762,7 @@ DECLARE_EVENT_CLASS(smb3_exit_err_class, ), TP_fast_assign( __entry->xid = xid; - __assign_str(func_name, func_name); + __assign_str(func_name); __entry->rc = rc; ), TP_printk("\t%s: xid=%u rc=%d", @@ -815,7 +815,7 @@ DECLARE_EVENT_CLASS(smb3_enter_exit_class, ), TP_fast_assign( __entry->xid = xid; - __assign_str(func_name, func_name); + __assign_str(func_name); ), TP_printk("\t%s: xid=%u", __get_str(func_name), 
__entry->xid) @@ -852,7 +852,7 @@ DECLARE_EVENT_CLASS(smb3_tcon_class, __entry->xid = xid; __entry->tid = tid; __entry->sesid = sesid; - __assign_str(name, unc_name); + __assign_str(name); __entry->rc = rc; ), TP_printk("xid=%u sid=0x%llx tid=0x%x unc_name=%s rc=%d", @@ -896,7 +896,7 @@ DECLARE_EVENT_CLASS(smb3_open_enter_class, __entry->xid = xid; __entry->tid = tid; __entry->sesid = sesid; - __assign_str(path, full_path); + __assign_str(path); __entry->create_options = create_options; __entry->desired_access = desired_access; ), @@ -1098,7 +1098,7 @@ DECLARE_EVENT_CLASS(smb3_connect_class, __entry->conn_id = conn_id; pss = (struct sockaddr_storage *)__entry->dst_addr; *pss = *dst_addr; - __assign_str(hostname, hostname); + __assign_str(hostname); ), TP_printk("conn_id=0x%llx server=%s addr=%pISpsfc", __entry->conn_id, @@ -1134,7 +1134,7 @@ DECLARE_EVENT_CLASS(smb3_connect_err_class, __entry->rc = rc; pss = (struct sockaddr_storage *)__entry->dst_addr; *pss = *dst_addr; - __assign_str(hostname, hostname); + __assign_str(hostname); ), TP_printk("rc=%d conn_id=0x%llx server=%s addr=%pISpsfc", __entry->rc, @@ -1166,7 +1166,7 @@ DECLARE_EVENT_CLASS(smb3_reconnect_class, TP_fast_assign( __entry->currmid = currmid; __entry->conn_id = conn_id; - __assign_str(hostname, hostname); + __assign_str(hostname); ), TP_printk("conn_id=0x%llx server=%s current_mid=%llu", __entry->conn_id, @@ -1255,7 +1255,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class, TP_fast_assign( __entry->currmid = currmid; __entry->conn_id = conn_id; - __assign_str(hostname, hostname); + __assign_str(hostname); __entry->credits = credits; __entry->credits_to_add = credits_to_add; __entry->in_flight = in_flight; diff --git a/fs/splice.c b/fs/splice.c index 218e24b1ac40..60aed8de21f8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -362,7 +362,7 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos, iov_iter_bvec(&to, ITER_DEST, bv, npages, len); init_sync_kiocb(&kiocb, in); kiocb.ki_pos = *ppos; - ret = 
call_read_iter(in, &kiocb, &to); + ret = in->f_op->read_iter(&kiocb, &to); if (ret > 0) { keep = DIV_ROUND_UP(ret, PAGE_SIZE); @@ -740,7 +740,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); init_sync_kiocb(&kiocb, out); kiocb.ki_pos = sd.pos; - ret = call_write_iter(out, &kiocb, &from); + ret = out->f_op->write_iter(&kiocb, &from); sd.pos = kiocb.ki_pos; if (ret <= 0) break; diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index e8df6430444b..a8c1e7f9a609 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -375,8 +375,6 @@ void squashfs_fill_page(struct page *page, struct squashfs_cache_entry *buffer, flush_dcache_page(page); if (copied == avail) SetPageUptodate(page); - else - SetPageError(page); } /* Copy data into page cache */ @@ -471,7 +469,7 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) res = read_blocklist(inode, index, &block); if (res < 0) - goto error_out; + goto out; if (res == 0) res = squashfs_readpage_sparse(page, expected); @@ -483,8 +481,6 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) if (!res) return 0; -error_out: - SetPageError(page); out: pageaddr = kmap_atomic(page); memset(pageaddr, 0, PAGE_SIZE); diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 763a3f7a75f6..2a689ce71de9 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -106,14 +106,13 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, return 0; mark_errored: - /* Decompression failed, mark pages as errored. Target_page is + /* Decompression failed. 
Target_page is * dealt with by the caller */ for (i = 0; i < pages; i++) { if (page[i] == NULL || page[i] == target_page) continue; flush_dcache_page(page[i]); - SetPageError(page[i]); unlock_page(page[i]); put_page(page[i]); } diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c index 11e4539b9eae..65aae7e2a859 100644 --- a/fs/squashfs/namei.c +++ b/fs/squashfs/namei.c @@ -62,27 +62,21 @@ */ static int get_dir_index_using_name(struct super_block *sb, u64 *next_block, int *next_offset, u64 index_start, - int index_offset, int i_count, const char *name, - int len) + int index_offset, int i_count, const char *name) { struct squashfs_sb_info *msblk = sb->s_fs_info; int i, length = 0, err; unsigned int size; struct squashfs_dir_index *index; - char *str; TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count); - index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL); + index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL); if (index == NULL) { ERROR("Failed to allocate squashfs_dir_index\n"); goto out; } - str = &index->name[SQUASHFS_NAME_LEN + 1]; - strncpy(str, name, len); - str[len] = '\0'; - for (i = 0; i < i_count; i++) { err = squashfs_read_metadata(sb, index, &index_start, &index_offset, sizeof(*index)); @@ -101,7 +95,7 @@ static int get_dir_index_using_name(struct super_block *sb, index->name[size] = '\0'; - if (strcmp(index->name, str) > 0) + if (strcmp(index->name, name) > 0) break; length = le32_to_cpu(index->index); @@ -153,7 +147,7 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry, length = get_dir_index_using_name(dir->i_sb, &block, &offset, squashfs_i(dir)->dir_idx_start, squashfs_i(dir)->dir_idx_offset, - squashfs_i(dir)->dir_idx_cnt, name, len); + squashfs_i(dir)->dir_idx_cnt, name); while (length < i_size_read(dir)) { /* diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c index 2bf977a52c2c..6ef735bd841a 100644 --- a/fs/squashfs/symlink.c +++ b/fs/squashfs/symlink.c @@ 
-32,20 +32,19 @@ static int squashfs_symlink_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct super_block *sb = inode->i_sb; struct squashfs_sb_info *msblk = sb->s_fs_info; - int index = page->index << PAGE_SHIFT; + int index = folio_pos(folio); u64 block = squashfs_i(inode)->start; int offset = squashfs_i(inode)->offset; int length = min_t(int, i_size_read(inode) - index, PAGE_SIZE); - int bytes, copied; + int bytes, copied, error; void *pageaddr; struct squashfs_cache_entry *entry; TRACE("Entered squashfs_symlink_readpage, page index %ld, start block " - "%llx, offset %x\n", page->index, block, offset); + "%llx, offset %x\n", folio->index, block, offset); /* * Skip index bytes into symlink metadata. @@ -57,14 +56,15 @@ static int squashfs_symlink_read_folio(struct file *file, struct folio *folio) ERROR("Unable to read symlink [%llx:%x]\n", squashfs_i(inode)->start, squashfs_i(inode)->offset); - goto error_out; + error = bytes; + goto out; } } /* * Read length bytes from symlink metadata. Squashfs_read_metadata * is not used here because it can sleep and we want to use - * kmap_atomic to map the page. Instead call the underlying + * kmap_local to map the folio. Instead call the underlying * squashfs_cache_get routine. As length bytes may overlap metadata * blocks, we may need to call squashfs_cache_get multiple times. 
*/ @@ -75,29 +75,26 @@ static int squashfs_symlink_read_folio(struct file *file, struct folio *folio) squashfs_i(inode)->start, squashfs_i(inode)->offset); squashfs_cache_put(entry); - goto error_out; + error = entry->error; + goto out; } - pageaddr = kmap_atomic(page); + pageaddr = kmap_local_folio(folio, 0); copied = squashfs_copy_data(pageaddr + bytes, entry, offset, length - bytes); if (copied == length - bytes) memset(pageaddr + length, 0, PAGE_SIZE - length); else block = entry->next_index; - kunmap_atomic(pageaddr); + kunmap_local(pageaddr); squashfs_cache_put(entry); } - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - return 0; - -error_out: - SetPageError(page); - unlock_page(page); - return 0; + flush_dcache_folio(folio); + error = 0; +out: + folio_end_read(folio, error == 0); + return error; } diff --git a/fs/super.c b/fs/super.c index 69ce6c600968..b72f1d288e95 100644 --- a/fs/super.c +++ b/fs/super.c @@ -274,6 +274,7 @@ static void destroy_super_work(struct work_struct *work) { struct super_block *s = container_of(work, struct super_block, destroy_work); + fsnotify_sb_free(s); security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 7cd64021d453..d1995e2d6c94 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -785,3 +785,30 @@ int sysfs_emit_at(char *buf, int at, const char *fmt, ...) return len; } EXPORT_SYMBOL_GPL(sysfs_emit_at); + +/** + * sysfs_bin_attr_simple_read - read callback to simply copy from memory. + * @file: attribute file which is being read. + * @kobj: object to which the attribute belongs. + * @attr: attribute descriptor. + * @buf: destination buffer. + * @off: offset in bytes from which to read. + * @count: maximum number of bytes to read. + * + * Simple ->read() callback for bin_attributes backed by a buffer in memory. 
+ * The @private and @size members in struct bin_attribute must be set to the + * buffer's location and size before the bin_attribute is created in sysfs. + * + * Bounds check for @off and @count is done in sysfs_kf_bin_read(). + * Negative value check for @off is done in vfs_setpos() and default_llseek(). + * + * Returns number of bytes written to @buf. + */ +ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + memcpy(buf, attr->private + off, count); + return count; +} +EXPORT_SYMBOL_GPL(sysfs_bin_attr_simple_read); diff --git a/fs/udf/file.c b/fs/udf/file.c index 0ceac4b5937c..97c59585208c 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -39,7 +39,7 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); struct address_space *mapping = inode->i_mapping; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); loff_t size; unsigned int end; vm_fault_t ret = VM_FAULT_LOCKED; @@ -48,31 +48,31 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); filemap_invalidate_lock_shared(mapping); - lock_page(page); + folio_lock(folio); size = i_size_read(inode); - if (page->mapping != inode->i_mapping || page_offset(page) >= size) { - unlock_page(page); + if (folio->mapping != inode->i_mapping || folio_pos(folio) >= size) { + folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out_unlock; } /* Space is already allocated for in-ICB file */ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) goto out_dirty; - if (page->index == size >> PAGE_SHIFT) + if (folio->index == size >> PAGE_SHIFT) end = size & ~PAGE_MASK; else end = PAGE_SIZE; - err = __block_write_begin(page, 0, end, udf_get_block); + err = __block_write_begin(&folio->page, 0, end, udf_get_block); if (err) { - unlock_page(page); + 
folio_unlock(folio); ret = vmf_fs_error(err); goto out_unlock; } - block_commit_write(page, 0, end); + block_commit_write(&folio->page, 0, end); out_dirty: - set_page_dirty(page); - wait_for_stable_page(page); + folio_mark_dirty(folio); + folio_wait_stable(folio); out_unlock: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 2f831a3a91af..2fb21c5ffccf 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -208,19 +208,14 @@ static int udf_writepages(struct address_space *mapping, return write_cache_pages(mapping, wbc, udf_adinicb_writepage, NULL); } -static void udf_adinicb_readpage(struct page *page) +static void udf_adinicb_read_folio(struct folio *folio) { - struct inode *inode = page->mapping->host; - char *kaddr; + struct inode *inode = folio->mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); loff_t isize = i_size_read(inode); - kaddr = kmap_local_page(page); - memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, isize); - memset(kaddr + isize, 0, PAGE_SIZE - isize); - flush_dcache_page(page); - SetPageUptodate(page); - kunmap_local(kaddr); + folio_fill_tail(folio, 0, iinfo->i_data + iinfo->i_lenEAttr, isize); + folio_mark_uptodate(folio); } static int udf_read_folio(struct file *file, struct folio *folio) @@ -228,7 +223,7 @@ static int udf_read_folio(struct file *file, struct folio *folio) struct udf_inode_info *iinfo = UDF_I(file_inode(file)); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - udf_adinicb_readpage(&folio->page); + udf_adinicb_read_folio(folio); folio_unlock(folio); return 0; } @@ -254,7 +249,7 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct udf_inode_info *iinfo = UDF_I(file_inode(file)); - struct page *page; + struct folio *folio; int ret; if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { @@ -266,12 +261,13 @@ static int udf_write_begin(struct file *file, struct address_space 
*mapping, } if (WARN_ON_ONCE(pos >= PAGE_SIZE)) return -EIO; - page = grab_cache_page_write_begin(mapping, 0); - if (!page) - return -ENOMEM; - *pagep = page; - if (!PageUptodate(page)) - udf_adinicb_readpage(page); + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); + *pagep = &folio->page; + if (!folio_test_uptodate(folio)) + udf_adinicb_read_folio(folio); return 0; } @@ -280,17 +276,19 @@ static int udf_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = file_inode(file); + struct folio *folio; loff_t last_pos; if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) return generic_write_end(file, mapping, pos, len, copied, page, fsdata); + folio = page_folio(page); last_pos = pos + copied; if (last_pos > inode->i_size) i_size_write(inode, last_pos); - set_page_dirty(page); - unlock_page(page); - put_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); return copied; } @@ -341,7 +339,7 @@ const struct address_space_operations udf_aops = { */ int udf_expand_file_adinicb(struct inode *inode) { - struct page *page; + struct folio *folio; struct udf_inode_info *iinfo = UDF_I(inode); int err; @@ -357,12 +355,13 @@ int udf_expand_file_adinicb(struct inode *inode) return 0; } - page = find_or_create_page(inode->i_mapping, 0, GFP_KERNEL); - if (!page) - return -ENOMEM; + folio = __filemap_get_folio(inode->i_mapping, 0, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_KERNEL); + if (IS_ERR(folio)) + return PTR_ERR(folio); - if (!PageUptodate(page)) - udf_adinicb_readpage(page); + if (!folio_test_uptodate(folio)) + udf_adinicb_read_folio(folio); down_write(&iinfo->i_data_sem); memset(iinfo->i_data + iinfo->i_lenEAttr, 0x00, iinfo->i_lenAlloc); @@ -371,22 +370,22 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; - 
set_page_dirty(page); - unlock_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); up_write(&iinfo->i_data_sem); err = filemap_fdatawrite(inode->i_mapping); if (err) { /* Restore everything back so that we don't lose data... */ - lock_page(page); + folio_lock(folio); down_write(&iinfo->i_data_sem); - memcpy_to_page(page, 0, iinfo->i_data + iinfo->i_lenEAttr, - inode->i_size); - unlock_page(page); + memcpy_from_folio(iinfo->i_data + iinfo->i_lenEAttr, + folio, 0, inode->i_size); + folio_unlock(folio); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; iinfo->i_lenAlloc = inode->i_size; up_write(&iinfo->i_data_sem); } - put_page(page); + folio_put(folio); mark_inode_dirty(inode); return err; diff --git a/fs/udf/super.c b/fs/udf/super.c index 2217f7ed7a49..9381a66c6ce5 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -630,7 +630,7 @@ static int udf_parse_param(struct fs_context *fc, struct fs_parameter *param) if (!uopt->nls_map) { errorf(fc, "iocharset %s not found", param->string); - return -EINVAL;; + return -EINVAL; } } break; @@ -895,7 +895,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) int ret; struct timestamp *ts; - outstr = kmalloc(128, GFP_KERNEL); + outstr = kzalloc(128, GFP_KERNEL); if (!outstr) return -ENOMEM; @@ -921,11 +921,11 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) ret = udf_dstrCS0toChar(sb, outstr, 31, pvoldesc->volIdent, 32); if (ret < 0) { - strcpy(UDF_SB(sb)->s_volume_ident, "InvalidName"); + strscpy_pad(UDF_SB(sb)->s_volume_ident, "InvalidName"); pr_warn("incorrect volume identification, setting to " "'InvalidName'\n"); } else { - strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret); + strscpy_pad(UDF_SB(sb)->s_volume_ident, outstr); } udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident); diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index f7eaf7b14594..fe03745d09b1 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -99,18 +99,17 @@ static int udf_pc_to_char(struct 
super_block *sb, unsigned char *from, static int udf_symlink_filler(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct buffer_head *bh = NULL; unsigned char *symlink; int err = 0; - unsigned char *p = page_address(page); + unsigned char *p = folio_address(folio); struct udf_inode_info *iinfo = UDF_I(inode); /* We don't support symlinks longer than one block */ if (inode->i_size > inode->i_sb->s_blocksize) { err = -ENAMETOOLONG; - goto out_unlock; + goto out; } if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { @@ -120,24 +119,15 @@ static int udf_symlink_filler(struct file *file, struct folio *folio) if (!bh) { if (!err) err = -EFSCORRUPTED; - goto out_err; + goto out; } symlink = bh->b_data; } err = udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p, PAGE_SIZE); brelse(bh); - if (err) - goto out_err; - - SetPageUptodate(page); - unlock_page(page); - return 0; - -out_err: - SetPageError(page); -out_unlock: - unlock_page(page); +out: + folio_end_read(folio, err == 0); return err; } @@ -147,12 +137,12 @@ static int udf_symlink_getattr(struct mnt_idmap *idmap, { struct dentry *dentry = path->dentry; struct inode *inode = d_backing_inode(dentry); - struct page *page; + struct folio *folio; generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); - page = read_mapping_page(inode->i_mapping, 0, NULL); - if (IS_ERR(page)) - return PTR_ERR(page); + folio = read_mapping_folio(inode->i_mapping, 0, NULL); + if (IS_ERR(folio)) + return PTR_ERR(folio); /* * UDF uses non-trivial encoding of symlinks so i_size does not match * number of characters reported by readlink(2) which apparently some @@ -162,8 +152,8 @@ static int udf_symlink_getattr(struct mnt_idmap *idmap, * let's report the length of string returned by readlink(2) for * st_size. 
*/ - stat->size = strlen(page_address(page)); - put_page(page); + stat->size = strlen(folio_address(folio)); + folio_put(folio); return 0; } diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c index 758163af39c2..78ecc633606f 100644 --- a/fs/udf/udftime.c +++ b/fs/udf/udftime.c @@ -46,13 +46,18 @@ udf_disk_stamp_to_time(struct timespec64 *dest, struct timestamp src) dest->tv_sec = mktime64(year, src.month, src.day, src.hour, src.minute, src.second); dest->tv_sec -= offset * 60; - dest->tv_nsec = 1000 * (src.centiseconds * 10000 + - src.hundredsOfMicroseconds * 100 + src.microseconds); + /* * Sanitize nanosecond field since reportedly some filesystems are * recorded with bogus sub-second values. */ - dest->tv_nsec %= NSEC_PER_SEC; + if (src.centiseconds < 100 && src.hundredsOfMicroseconds < 100 && + src.microseconds < 100) { + dest->tv_nsec = 1000 * (src.centiseconds * 10000 + + src.hundredsOfMicroseconds * 100 + src.microseconds); + } else { + dest->tv_nsec = 0; + } } void diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index 0e51c0025a16..e309afe2b2bb 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile @@ -18,13 +18,13 @@ ifdef REGENERATE_UTF8DATA quiet_cmd_utf8data = GEN $@ cmd_utf8data = $< \ - -a $(srctree)/$(src)/DerivedAge.txt \ - -c $(srctree)/$(src)/DerivedCombiningClass.txt \ - -p $(srctree)/$(src)/DerivedCoreProperties.txt \ - -d $(srctree)/$(src)/UnicodeData.txt \ - -f $(srctree)/$(src)/CaseFolding.txt \ - -n $(srctree)/$(src)/NormalizationCorrections.txt \ - -t $(srctree)/$(src)/NormalizationTest.txt \ + -a $(src)/DerivedAge.txt \ + -c $(src)/DerivedCombiningClass.txt \ + -p $(src)/DerivedCoreProperties.txt \ + -d $(src)/UnicodeData.txt \ + -f $(src)/CaseFolding.txt \ + -n $(src)/NormalizationCorrections.txt \ + -t $(src)/NormalizationTest.txt \ -o $@ $(obj)/utf8data.c: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 2a564f813314..eee7320ab0b0 100644 --- a/fs/userfaultfd.c +++ 
b/fs/userfaultfd.c @@ -658,7 +658,10 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) struct userfaultfd_fork_ctx *fctx; octx = vma->vm_userfaultfd_ctx.ctx; - if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { + if (!octx) + return 0; + + if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) { vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 76674ad5833e..c50447548d65 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -4,8 +4,8 @@ # All Rights Reserved. # -ccflags-y += -I $(srctree)/$(src) # needed for trace events -ccflags-y += -I $(srctree)/$(src)/libxfs +ccflags-y += -I $(src) # needed for trace events +ccflags-y += -I $(src)/libxfs obj-$(CONFIG_XFS_FS) += xfs.o @@ -34,6 +34,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_dir2_node.o \ xfs_dir2_sf.o \ xfs_dquot_buf.o \ + xfs_exchmaps.o \ xfs_ialloc.o \ xfs_ialloc_btree.o \ xfs_iext_tree.o \ @@ -41,6 +42,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_inode_buf.o \ xfs_log_rlimit.o \ xfs_ag_resv.o \ + xfs_parent.o \ xfs_rmap.o \ xfs_rmap_btree.o \ xfs_refcount.o \ @@ -49,6 +51,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_symlink_remote.o \ xfs_trans_inode.o \ xfs_trans_resv.o \ + xfs_trans_space.o \ xfs_types.o \ ) # xfs_rtbitmap is shared with libxfs @@ -67,6 +70,7 @@ xfs-y += xfs_aops.o \ xfs_dir2_readdir.o \ xfs_discard.o \ xfs_error.o \ + xfs_exchrange.o \ xfs_export.o \ xfs_extent_busy.o \ xfs_file.o \ @@ -74,6 +78,7 @@ xfs-y += xfs_aops.o \ xfs_fsmap.o \ xfs_fsops.o \ xfs_globals.o \ + xfs_handle.o \ xfs_health.o \ xfs_icache.o \ xfs_ioctl.o \ @@ -101,6 +106,7 @@ xfs-y += xfs_log.o \ xfs_buf_item.o \ xfs_buf_item_recover.o \ xfs_dquot_item_recover.o \ + xfs_exchmaps_item.o \ xfs_extfree_item.o \ xfs_attr_item.o \ xfs_icreate_item.o \ @@ -157,11 +163,13 @@ xfs-y += $(addprefix scrub/, \ common.o \ dabtree.o \ dir.o \ + dirtree.o \ fscounters.o \ health.o \ 
ialloc.o \ inode.o \ iscan.o \ + listxattr.o \ nlinks.o \ parent.o \ readdir.o \ @@ -170,6 +178,7 @@ xfs-y += $(addprefix scrub/, \ scrub.o \ symlink.o \ xfarray.o \ + xfblob.o \ xfile.o \ ) @@ -191,23 +200,32 @@ ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ alloc_repair.o \ + attr_repair.o \ bmap_repair.o \ cow_repair.o \ + dir_repair.o \ + dirtree_repair.o \ + findparent.o \ fscounters_repair.o \ ialloc_repair.o \ inode_repair.o \ newbt.o \ nlinks_repair.o \ + orphanage.o \ + parent_repair.o \ rcbag_btree.o \ rcbag.o \ reap.o \ refcount_repair.o \ repair.o \ rmap_repair.o \ + symlink_repair.o \ + tempfile.o \ ) xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ rtbitmap_repair.o \ + rtsummary_repair.o \ ) xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \ diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index dc1873f76bff..240e079cb3fb 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -194,7 +194,7 @@ xfs_initialize_perag_data( pag = xfs_perag_get(mp, index); error = xfs_alloc_read_agf(pag, NULL, 0, NULL); if (!error) - error = xfs_ialloc_read_agi(pag, NULL, NULL); + error = xfs_ialloc_read_agi(pag, NULL, 0, NULL); if (error) { xfs_perag_put(pag); return error; @@ -931,7 +931,7 @@ xfs_ag_shrink_space( int error, err2; ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1); - error = xfs_ialloc_read_agi(pag, *tpp, &agibp); + error = xfs_ialloc_read_agi(pag, *tpp, 0, &agibp); if (error) return error; @@ -963,9 +963,7 @@ xfs_ag_shrink_space( * Disable perag reservations so it doesn't cause the allocation request * to fail. We'll reestablish reservation before we return. 
*/ - error = xfs_ag_resv_free(pag); - if (error) - return error; + xfs_ag_resv_free(pag); /* internal log shouldn't also show up in the free space btrees */ error = xfs_alloc_vextent_exact_bno(&args, @@ -1062,7 +1060,7 @@ xfs_ag_extend_space( ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1); - error = xfs_ialloc_read_agi(pag, tp, &bp); + error = xfs_ialloc_read_agi(pag, tp, 0, &bp); if (error) return error; @@ -1119,7 +1117,7 @@ xfs_ag_get_geometry( int error; /* Lock the AG headers. */ - error = xfs_ialloc_read_agi(pag, NULL, &agi_bp); + error = xfs_ialloc_read_agi(pag, NULL, 0, &agi_bp); if (error) return error; error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp); diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index da1057bd0e60..216423df939e 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -126,14 +126,13 @@ xfs_ag_resv_needed( } /* Clean out a reservation */ -static int +static void __xfs_ag_resv_free( struct xfs_perag *pag, enum xfs_ag_resv_type type) { struct xfs_ag_resv *resv; xfs_extlen_t oldresv; - int error; trace_xfs_ag_resv_free(pag, type, 0); @@ -149,30 +148,19 @@ __xfs_ag_resv_free( oldresv = resv->ar_orig_reserved; else oldresv = resv->ar_reserved; - error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true); + xfs_add_fdblocks(pag->pag_mount, oldresv); resv->ar_reserved = 0; resv->ar_asked = 0; resv->ar_orig_reserved = 0; - - if (error) - trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno, - error, _RET_IP_); - return error; } /* Free a per-AG reservation. 
*/ -int +void xfs_ag_resv_free( struct xfs_perag *pag) { - int error; - int err2; - - error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT); - err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA); - if (err2 && !error) - error = err2; - return error; + __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT); + __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA); } static int @@ -216,7 +204,7 @@ __xfs_ag_resv_init( if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL)) error = -ENOSPC; else - error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true); + error = xfs_dec_fdblocks(mp, hidden_space, true); if (error) { trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, error, _RET_IP_); diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h index b74b210008ea..ff20ed93de77 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.h +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -6,7 +6,7 @@ #ifndef __XFS_AG_RESV_H__ #define __XFS_AG_RESV_H__ -int xfs_ag_resv_free(struct xfs_perag *pag); +void xfs_ag_resv_free(struct xfs_perag *pag); int xfs_ag_resv_init(struct xfs_perag *pag, struct xfs_trans *tp); bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 9da52e92172a..6cb8b2ddc541 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -79,7 +79,7 @@ xfs_prealloc_blocks( } /* - * The number of blocks per AG that we withhold from xfs_mod_fdblocks to + * The number of blocks per AG that we withhold from xfs_dec_fdblocks to * guarantee that we can refill the AGFL prior to allocating space in a nearly * full AG. Although the space described by the free space btrees, the * blocks used by the freesp btrees themselves, and the blocks owned by the @@ -89,7 +89,7 @@ xfs_prealloc_blocks( * until the fs goes down, we subtract this many AG blocks from the incore * fdblocks to ensure user allocation does not overcommit the space the * filesystem needs for the AGFLs. 
The rmap btree uses a per-AG reservation to - * withhold space from xfs_mod_fdblocks, so we do not account for that here. + * withhold space from xfs_dec_fdblocks, so we do not account for that here. */ #define XFS_ALLOCBT_AGFL_RESERVE 4 diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 673a4b6d2e8d..430cd3244c14 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -26,6 +26,7 @@ #include "xfs_trace.h" #include "xfs_attr_item.h" #include "xfs_xattr.h" +#include "xfs_parent.h" struct kmem_cache *xfs_attr_intent_cache; @@ -87,6 +88,8 @@ xfs_attr_is_leaf( struct xfs_iext_cursor icur; struct xfs_bmbt_irec imap; + ASSERT(!xfs_need_iread_extents(ifp)); + if (ifp->if_nextents != 1 || ifp->if_format != XFS_DINODE_FMT_EXTENTS) return false; @@ -224,11 +227,21 @@ int xfs_attr_get_ilocked( struct xfs_da_args *args) { + int error; + xfs_assert_ilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); if (!xfs_inode_hasattr(args->dp)) return -ENOATTR; + /* + * The incore attr fork iext tree must be loaded for xfs_attr_is_leaf + * to work correctly. + */ + error = xfs_iread_extents(args->trans, args->dp, XFS_ATTR_FORK); + if (error) + return error; + if (args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_getvalue(args); if (xfs_attr_is_leaf(args->dp)) @@ -264,9 +277,11 @@ xfs_attr_get( if (xfs_is_shutdown(args->dp->i_mount)) return -EIO; + if (!args->owner) + args->owner = args->dp->i_ino; args->geo = args->dp->i_mount->m_attr_geo; args->whichfork = XFS_ATTR_FORK; - args->hashval = xfs_da_hashname(args->name, args->namelen); + xfs_attr_sethash(args); /* Entirely possible to look up a name which doesn't exist */ args->op_flags = XFS_DA_OP_OKNOENT; @@ -363,7 +378,7 @@ xfs_attr_try_sf_addname( * Commit the shortform mods, and we're done. * NOTE: this is also the error path (EEXIST, etc). 
*/ - if (!error && !(args->op_flags & XFS_DA_OP_NOTIME)) + if (!error) xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); if (xfs_has_wsync(dp->i_mount)) @@ -401,6 +416,50 @@ out: return error; } +/* Compute the hash value for a user/root/secure extended attribute */ +xfs_dahash_t +xfs_attr_hashname( + const uint8_t *name, + int namelen) +{ + return xfs_da_hashname(name, namelen); +} + +/* Compute the hash value for any extended attribute from any namespace. */ +xfs_dahash_t +xfs_attr_hashval( + struct xfs_mount *mp, + unsigned int attr_flags, + const uint8_t *name, + int namelen, + const void *value, + int valuelen) +{ + ASSERT(xfs_attr_check_namespace(attr_flags)); + + if (attr_flags & XFS_ATTR_PARENT) + return xfs_parent_hashattr(mp, name, namelen, value, valuelen); + + return xfs_attr_hashname(name, namelen); +} + +/* + * PPTR_REPLACE operations require the caller to set the old and new names and + * values explicitly. Update the canonical fields to the new name and value + * here now that the removal phase has finished. + */ +static void +xfs_attr_update_pptr_replace_args( + struct xfs_da_args *args) +{ + ASSERT(args->new_namelen > 0); + args->name = args->new_name; + args->namelen = args->new_namelen; + args->value = args->new_value; + args->valuelen = args->new_valuelen; + xfs_attr_sethash(args); +} + /* * Handle the state change on completion of a multi-state attr operation. 
* @@ -418,14 +477,15 @@ xfs_attr_complete_op( enum xfs_delattr_state replace_state) { struct xfs_da_args *args = attr->xattri_da_args; - bool do_replace = args->op_flags & XFS_DA_OP_REPLACE; + + if (!(args->op_flags & XFS_DA_OP_REPLACE)) + replace_state = XFS_DAS_DONE; + else if (xfs_attr_intent_op(attr) == XFS_ATTRI_OP_FLAGS_PPTR_REPLACE) + xfs_attr_update_pptr_replace_args(args); args->op_flags &= ~XFS_DA_OP_REPLACE; args->attr_filter &= ~XFS_ATTR_INCOMPLETE; - if (do_replace) - return replace_state; - - return XFS_DAS_DONE; + return replace_state; } static int @@ -647,8 +707,8 @@ xfs_attr_leaf_remove_attr( int forkoff; int error; - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, - &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, + args->blkno, &bp); if (error) return error; @@ -679,7 +739,7 @@ xfs_attr_leaf_shrink( if (!xfs_attr_is_leaf(dp)) return 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, &bp); if (error) return error; @@ -868,6 +928,11 @@ xfs_attr_lookup( return -ENOATTR; } + /* Prerequisite for xfs_attr_is_leaf */ + error = xfs_iread_extents(args->trans, args->dp, XFS_ATTR_FORK); + if (error) + return error; + if (xfs_attr_is_leaf(dp)) { error = xfs_attr_leaf_hasname(args, &bp); @@ -883,74 +948,72 @@ xfs_attr_lookup( return error; } -static void -xfs_attr_defer_add( - struct xfs_da_args *args, - unsigned int op_flags) +int +xfs_attr_add_fork( + struct xfs_inode *ip, /* incore inode pointer */ + int size, /* space new attribute needs */ + int rsvd) /* xact may use reserved blks */ { + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; /* transaction pointer */ + unsigned int blks; /* space reservation */ + int error; /* error return value */ - struct xfs_attr_intent *new; + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); - new = kmem_cache_zalloc(xfs_attr_intent_cache, - GFP_KERNEL | __GFP_NOFAIL); - new->xattri_op_flags = op_flags; 
- new->xattri_da_args = args; + blks = XFS_ADDAFORK_SPACE_RES(mp); - switch (op_flags) { - case XFS_ATTRI_OP_FLAGS_SET: - new->xattri_dela_state = xfs_attr_init_add_state(args); - break; - case XFS_ATTRI_OP_FLAGS_REPLACE: - new->xattri_dela_state = xfs_attr_init_replace_state(args); - break; - case XFS_ATTRI_OP_FLAGS_REMOVE: - new->xattri_dela_state = xfs_attr_init_remove_state(args); - break; - default: - ASSERT(0); - } + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0, + rsvd, &tp); + if (error) + return error; + + if (xfs_inode_has_attr_fork(ip)) + goto trans_cancel; + + error = xfs_bmap_add_attrfork(tp, ip, size, rsvd); + if (error) + goto trans_cancel; + + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; - xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type); - trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); +trans_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; } /* - * Note: If args->value is NULL the attribute will be removed, just like the - * Linux ->setattr API. + * Make a change to the xattr structure. + * + * The caller must have initialized @args, attached dquots, and must not hold + * any ILOCKs. Reserved data blocks may be used if @rsvd is set. + * + * Returns -EEXIST for XFS_ATTRUPDATE_CREATE if the name already exists. + * Returns -ENOATTR for XFS_ATTRUPDATE_REMOVE if the name does not exist. + * Returns 0 on success, or a negative errno if something else went wrong. 
*/ int xfs_attr_set( - struct xfs_da_args *args) + struct xfs_da_args *args, + enum xfs_attr_update op, + bool rsvd) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_trans_res tres; - bool rsvd = (args->attr_filter & XFS_ATTR_ROOT); int error, local; int rmt_blks = 0; unsigned int total; - if (xfs_is_shutdown(dp->i_mount)) - return -EIO; - - error = xfs_qm_dqattach(dp); - if (error) - return error; - - args->geo = mp->m_attr_geo; - args->whichfork = XFS_ATTR_FORK; - args->hashval = xfs_da_hashname(args->name, args->namelen); + ASSERT(!args->trans); - /* - * We have no control over the attribute names that userspace passes us - * to remove, so we have to allow the name lookup prior to attribute - * removal to fail as well. Preserve the logged flag, since we need - * to pass that through to the logging code. - */ - args->op_flags = XFS_DA_OP_OKNOENT | - (args->op_flags & XFS_DA_OP_LOGGED); - - if (args->value) { + switch (op) { + case XFS_ATTRUPDATE_UPSERT: + case XFS_ATTRUPDATE_CREATE: + case XFS_ATTRUPDATE_REPLACE: XFS_STATS_INC(mp, xs_attr_set); args->total = xfs_attr_calc_size(args, &local); @@ -963,16 +1026,18 @@ xfs_attr_set( xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); - error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); + error = xfs_attr_add_fork(dp, sf_size, rsvd); if (error) return error; } if (!local) rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen); - } else { + break; + case XFS_ATTRUPDATE_REMOVE: XFS_STATS_INC(mp, xs_attr_remove); - rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); + rmt_blks = xfs_attr3_max_rmt_blocks(mp); + break; } /* @@ -984,12 +1049,9 @@ xfs_attr_set( if (error) return error; - if (args->value || xfs_inode_hasattr(dp)) { - error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK, + if (op != XFS_ATTRUPDATE_REMOVE || xfs_inode_hasattr(dp)) { + error = xfs_iext_count_extend(args->trans, dp, XFS_ATTR_FORK, XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); - if (error == -EFBIG) - error = 
xfs_iext_count_upgrade(args->trans, dp, - XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); if (error) goto out_trans_cancel; } @@ -997,26 +1059,26 @@ xfs_attr_set( error = xfs_attr_lookup(args); switch (error) { case -EEXIST: - if (!args->value) { + if (op == XFS_ATTRUPDATE_REMOVE) { /* if no value, we are performing a remove operation */ - xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REMOVE); + xfs_attr_defer_add(args, XFS_ATTR_DEFER_REMOVE); break; } /* Pure create fails if the attr already exists */ - if (args->attr_flags & XATTR_CREATE) + if (op == XFS_ATTRUPDATE_CREATE) goto out_trans_cancel; - xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REPLACE); + xfs_attr_defer_add(args, XFS_ATTR_DEFER_REPLACE); break; case -ENOATTR: /* Can't remove what isn't there. */ - if (!args->value) + if (op == XFS_ATTRUPDATE_REMOVE) goto out_trans_cancel; /* Pure replace fails if no existing attr to replace. */ - if (args->attr_flags & XATTR_REPLACE) + if (op == XFS_ATTRUPDATE_REPLACE) goto out_trans_cancel; - xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_SET); + xfs_attr_defer_add(args, XFS_ATTR_DEFER_SET); break; default: goto out_trans_cancel; @@ -1029,8 +1091,7 @@ xfs_attr_set( if (xfs_has_wsync(mp)) xfs_trans_set_sync(args->trans); - if (!(args->op_flags & XFS_DA_OP_NOTIME)) - xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); + xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); /* * Commit the last in the sequence of transactions. 
@@ -1039,6 +1100,7 @@ xfs_attr_set( error = xfs_trans_commit(args->trans); out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); + args->trans = NULL; return error; out_trans_cancel: @@ -1051,7 +1113,7 @@ out_trans_cancel: * External routines when attribute list is inside the inode *========================================================================*/ -static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) +int xfs_attr_sf_totsize(struct xfs_inode *dp) { struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; @@ -1154,7 +1216,7 @@ xfs_attr_leaf_try_add( struct xfs_buf *bp; int error; - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, &bp); if (error) return error; @@ -1202,7 +1264,7 @@ xfs_attr_leaf_hasname( { int error = 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, bp); if (error) return error; @@ -1511,12 +1573,23 @@ out_release: return error; } +/* Enforce that there is at most one namespace bit per attr. */ +inline bool xfs_attr_check_namespace(unsigned int attr_flags) +{ + return hweight32(attr_flags & XFS_ATTR_NSP_ONDISK_MASK) < 2; +} + /* Returns true if the attribute entry name is valid. */ bool xfs_attr_namecheck( + unsigned int attr_flags, const void *name, size_t length) { + /* Only one namespace bit allowed. */ + if (!xfs_attr_check_namespace(attr_flags)) + return false; + /* * MAXNAMELEN includes the trailing null, but (name/length) leave it * out, so use >= for the length check. @@ -1524,6 +1597,10 @@ xfs_attr_namecheck( if (length >= MAXNAMELEN) return false; + /* Parent pointers have their own validation. 
*/ + if (attr_flags & XFS_ATTR_PARENT) + return xfs_parent_namecheck(attr_flags, name, length); + /* There shouldn't be any nulls here */ return !memchr(name, 0, length); } diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 81be9b3e4004..088cb7b30168 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -47,8 +47,9 @@ struct xfs_attrlist_cursor_kern { /* void; state communicated via *context */ -typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, - unsigned char *, int, int); +typedef void (*put_listent_func_t)(struct xfs_attr_list_context *context, + int flags, unsigned char *name, int namelen, void *value, + int valuelen); struct xfs_attr_list_context { struct xfs_trans *tp; @@ -510,8 +511,8 @@ struct xfs_attr_intent { struct xfs_da_args *xattri_da_args; /* - * Shared buffer containing the attr name and value so that the logging - * code can share large memory buffers between log items. + * Shared buffer containing the attr name, new name, and value so that + * the logging code can share large memory buffers between log items. */ struct xfs_attri_log_nameval *xattri_nameval; @@ -529,6 +530,11 @@ struct xfs_attr_intent { struct xfs_bmbt_irec xattri_map; }; +static inline unsigned int +xfs_attr_intent_op(const struct xfs_attr_intent *attr) +{ + return attr->xattri_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK; +} /*======================================================================== * Function prototypes for the kernel. 
@@ -544,10 +550,20 @@ int xfs_inode_hasattr(struct xfs_inode *ip); bool xfs_attr_is_leaf(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_da_args *args); int xfs_attr_get(struct xfs_da_args *args); -int xfs_attr_set(struct xfs_da_args *args); + +enum xfs_attr_update { + XFS_ATTRUPDATE_REMOVE, /* remove attr */ + XFS_ATTRUPDATE_UPSERT, /* set value, replace any existing attr */ + XFS_ATTRUPDATE_CREATE, /* set value, fail if attr already exists */ + XFS_ATTRUPDATE_REPLACE, /* set value, fail if attr does not exist */ +}; + +int xfs_attr_set(struct xfs_da_args *args, enum xfs_attr_update op, bool rsvd); int xfs_attr_set_iter(struct xfs_attr_intent *attr); int xfs_attr_remove_iter(struct xfs_attr_intent *attr); -bool xfs_attr_namecheck(const void *name, size_t length); +bool xfs_attr_check_namespace(unsigned int attr_flags); +bool xfs_attr_namecheck(unsigned int attr_flags, const void *name, + size_t length); int xfs_attr_calc_size(struct xfs_da_args *args, int *local); void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, unsigned int *total); @@ -590,7 +606,6 @@ xfs_attr_init_add_state(struct xfs_da_args *args) static inline enum xfs_delattr_state xfs_attr_init_remove_state(struct xfs_da_args *args) { - args->op_flags |= XFS_DA_OP_REMOVE; if (xfs_attr_is_shortform(args->dp)) return XFS_DAS_SF_REMOVE; if (xfs_attr_is_leaf(args->dp)) @@ -614,8 +629,25 @@ xfs_attr_init_replace_state(struct xfs_da_args *args) return xfs_attr_init_add_state(args); } +xfs_dahash_t xfs_attr_hashname(const uint8_t *name, int namelen); + +xfs_dahash_t xfs_attr_hashval(struct xfs_mount *mp, unsigned int attr_flags, + const uint8_t *name, int namelen, const void *value, + int valuelen); + +/* Set the hash value for any extended attribute from any namespace. 
*/ +static inline void xfs_attr_sethash(struct xfs_da_args *args) +{ + args->hashval = xfs_attr_hashval(args->dp->i_mount, args->attr_filter, + args->name, args->namelen, + args->value, args->valuelen); +} + extern struct kmem_cache *xfs_attr_intent_cache; int __init xfs_attr_intent_init_cache(void); void xfs_attr_intent_destroy_cache(void); +int xfs_attr_sf_totsize(struct xfs_inode *dp); +int xfs_attr_add_fork(struct xfs_inode *ip, int size, int rsvd); + #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index ac904cc1a97b..b9e98950eb3d 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -388,6 +388,27 @@ xfs_attr3_leaf_verify( return NULL; } +xfs_failaddr_t +xfs_attr3_leaf_header_check( + struct xfs_buf *bp, + xfs_ino_t owner) +{ + struct xfs_mount *mp = bp->b_mount; + + if (xfs_has_crc(mp)) { + struct xfs_attr3_leafblock *hdr3 = bp->b_addr; + + if (hdr3->hdr.info.hdr.magic != + cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->hdr.info.owner) != owner) + return __this_address; + } + + return NULL; +} + static void xfs_attr3_leaf_write_verify( struct xfs_buf *bp) @@ -448,16 +469,30 @@ int xfs_attr3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t bno, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, bno, 0, bpp, XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); - if (!err && tp && *bpp) + if (err || !(*bpp)) + return err; + + fa = xfs_attr3_leaf_header_check(*bpp, owner); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); + return -EFSCORRUPTED; + } + + if (tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); - return err; + return 0; } /*======================================================================== @@ -472,28 +507,57 @@ xfs_attr3_leaf_read( * INCOMPLETE flag will not be set in 
attr->attr_filter, but rather * XFS_DA_OP_RECOVERY will be set in args->op_flags. */ +static inline unsigned int xfs_attr_match_mask(const struct xfs_da_args *args) +{ + if (args->op_flags & XFS_DA_OP_RECOVERY) + return XFS_ATTR_NSP_ONDISK_MASK; + return XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE; +} + +static inline bool +xfs_attr_parent_match( + const struct xfs_da_args *args, + const void *value, + unsigned int valuelen) +{ + ASSERT(args->value != NULL); + + /* Parent pointers do not use remote values */ + if (!value) + return false; + + /* + * The only value we support is a parent rec. However, we'll accept + * any valuelen so that offline repair can delete ATTR_PARENT values + * that are not parent pointers. + */ + if (valuelen != args->valuelen) + return false; + + return memcmp(args->value, value, valuelen) == 0; +} + static bool xfs_attr_match( struct xfs_da_args *args, - uint8_t namelen, - unsigned char *name, - int flags) + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen) { + unsigned int mask = xfs_attr_match_mask(args); if (args->namelen != namelen) return false; + if ((args->attr_filter & mask) != (attr_flags & mask)) + return false; if (memcmp(args->name, name, namelen) != 0) return false; - /* Recovery ignores the INCOMPLETE flag. */ - if ((args->op_flags & XFS_DA_OP_RECOVERY) && - args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK)) - return true; + if (attr_flags & XFS_ATTR_PARENT) + return xfs_attr_parent_match(args, value, valuelen); - /* All remaining matches need to be filtered by INCOMPLETE state. */ - if (args->attr_filter != - (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE))) - return false; return true; } @@ -504,6 +568,13 @@ xfs_attr_copy_value( int valuelen) { /* + * Parent pointer lookups require the caller to specify the name and + * value, so don't copy anything. 
+ */ + if (args->attr_filter & XFS_ATTR_PARENT) + return 0; + + /* * No copy if all we have to do is get the length */ if (!args->valuelen) { @@ -711,8 +782,9 @@ xfs_attr_sf_findname( for (sfe = xfs_attr_sf_firstentry(sf); sfe < xfs_attr_sf_endptr(sf); sfe = xfs_attr_sf_nextentry(sfe)) { - if (xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) + if (xfs_attr_match(args, sfe->flags, sfe->nameval, + sfe->namelen, &sfe->nameval[sfe->namelen], + sfe->valuelen)) return sfe; } @@ -819,7 +891,8 @@ xfs_attr_sf_removename( */ if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && - !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) { + !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) && + !xfs_has_parent(mp)) { xfs_attr_fork_remove(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); @@ -828,7 +901,8 @@ xfs_attr_sf_removename( ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) || (args->op_flags & XFS_DA_OP_ADDNAME) || !xfs_has_attr2(mp) || - dp->i_df.if_format == XFS_DINODE_FMT_BTREE); + dp->i_df.if_format == XFS_DINODE_FMT_BTREE || + xfs_has_parent(mp)); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } @@ -904,6 +978,7 @@ xfs_attr_shortform_to_leaf( nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; + nargs.owner = args->owner; sfe = xfs_attr_sf_firstentry(sf); for (i = 0; i < sf->count; i++) { @@ -911,9 +986,13 @@ xfs_attr_shortform_to_leaf( nargs.namelen = sfe->namelen; nargs.value = &sfe->nameval[nargs.namelen]; nargs.valuelen = sfe->valuelen; - nargs.hashval = xfs_da_hashname(sfe->nameval, - sfe->namelen); nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK; + if (!xfs_attr_check_namespace(sfe->flags)) { + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto out; + } + xfs_attr_sethash(&nargs); error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == 
-ENOATTR); error = xfs_attr3_leaf_add(bp, &nargs); @@ -1027,7 +1106,7 @@ xfs_attr_shortform_verify( * one namespace flag per xattr, so we can just count the * bits (i.e. hweight) here. */ - if (hweight8(sfep->flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) + if (!xfs_attr_check_namespace(sfep->flags)) return __this_address; sfep = next_sfep; @@ -1106,6 +1185,7 @@ xfs_attr3_leaf_to_shortform( nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; + nargs.owner = args->owner; for (i = 0; i < ichdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) @@ -1158,7 +1238,7 @@ xfs_attr3_leaf_to_node( error = xfs_da_grow_inode(args, &blkno); if (error) goto out; - error = xfs_attr3_leaf_read(args->trans, dp, 0, &bp1); + error = xfs_attr3_leaf_read(args->trans, dp, args->owner, 0, &bp1); if (error) goto out; @@ -1237,7 +1317,7 @@ xfs_attr3_leaf_create( ichdr.magic = XFS_ATTR3_LEAF_MAGIC; hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->owner = cpu_to_be64(dp->i_ino); + hdr3->owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr); @@ -1993,7 +2073,7 @@ xfs_attr3_leaf_toosmall( if (blkno == 0) continue; error = xfs_attr3_leaf_read(state->args->trans, state->args->dp, - blkno, &bp); + state->args->owner, blkno, &bp); if (error) return error; @@ -2401,18 +2481,23 @@ xfs_attr3_leaf_lookup_int( */ if (entry->flags & XFS_ATTR_LOCAL) { name_loc = xfs_attr3_leaf_name_local(leaf, probe); - if (!xfs_attr_match(args, name_loc->namelen, - name_loc->nameval, entry->flags)) + if (!xfs_attr_match(args, entry->flags, + name_loc->nameval, name_loc->namelen, + &name_loc->nameval[name_loc->namelen], + be16_to_cpu(name_loc->valuelen))) continue; args->index = probe; return -EEXIST; } else { + unsigned int valuelen; + name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); - if (!xfs_attr_match(args, name_rmt->namelen, - name_rmt->name, entry->flags)) + valuelen = 
be32_to_cpu(name_rmt->valuelen); + if (!xfs_attr_match(args, entry->flags, name_rmt->name, + name_rmt->namelen, NULL, valuelen)) continue; args->index = probe; - args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = valuelen; args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks( args->dp->i_mount, @@ -2715,7 +2800,8 @@ xfs_attr3_leaf_clearflag( /* * Set up the operation. */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, + args->blkno, &bp); if (error) return error; @@ -2779,7 +2865,8 @@ xfs_attr3_leaf_setflag( /* * Set up the operation. */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, + args->blkno, &bp); if (error) return error; @@ -2838,7 +2925,8 @@ xfs_attr3_leaf_flipflags( /* * Read the block containing the "old" attr */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp1); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, + args->blkno, &bp1); if (error) return error; @@ -2846,8 +2934,8 @@ xfs_attr3_leaf_flipflags( * Read the block containing the "new" attr, if it is different */ if (args->blkno2 != args->blkno) { - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2, - &bp2); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, + args->blkno2, &bp2); if (error) return error; } else { diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 9b9948639c0f..bac219589896 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -98,12 +98,14 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp); int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local); int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, struct xfs_buf **bpp); + xfs_ino_t owner, 
xfs_dablk_t bno, struct xfs_buf **bpp); void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo, struct xfs_attr3_icleaf_hdr *to, struct xfs_attr_leafblock *from); void xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo, struct xfs_attr_leafblock *to, struct xfs_attr3_icleaf_hdr *from); +xfs_failaddr_t xfs_attr3_leaf_header_check(struct xfs_buf *bp, + xfs_ino_t owner); #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index ff0412828772..4c44ce1c8a64 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -43,19 +43,32 @@ * the logging system and therefore never have a log item. */ -/* - * Each contiguous block has a header, so it is not just a simple attribute - * length to FSB conversion. - */ -int +/* How many bytes can be stored in a remote value buffer? */ +inline unsigned int +xfs_attr3_rmt_buf_space( + struct xfs_mount *mp) +{ + unsigned int blocksize = mp->m_attr_geo->blksize; + + if (xfs_has_crc(mp)) + return blocksize - sizeof(struct xfs_attr3_rmt_hdr); + + return blocksize; +} + +/* Compute number of fsblocks needed to store a remote attr value */ +unsigned int xfs_attr3_rmt_blocks( - struct xfs_mount *mp, - int attrlen) + struct xfs_mount *mp, + unsigned int attrlen) { - if (xfs_has_crc(mp)) { - int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize); - return (attrlen + buflen - 1) / buflen; - } + /* + * Each contiguous block has a header, so it is not just a simple + * attribute length to FSB conversion. 
+ */ + if (xfs_has_crc(mp)) + return howmany(attrlen, xfs_attr3_rmt_buf_space(mp)); + return XFS_B_TO_FSB(mp, attrlen); } @@ -92,7 +105,6 @@ xfs_attr3_rmt_verify( struct xfs_mount *mp, struct xfs_buf *bp, void *ptr, - int fsbsize, xfs_daddr_t bno) { struct xfs_attr3_rmt_hdr *rmt = ptr; @@ -103,7 +115,7 @@ xfs_attr3_rmt_verify( return __this_address; if (be64_to_cpu(rmt->rm_blkno) != bno) return __this_address; - if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) + if (be32_to_cpu(rmt->rm_bytes) > mp->m_attr_geo->blksize - sizeof(*rmt)) return __this_address; if (be32_to_cpu(rmt->rm_offset) + be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX) @@ -122,9 +134,9 @@ __xfs_attr3_rmt_read_verify( { struct xfs_mount *mp = bp->b_mount; char *ptr; - int len; + unsigned int len; xfs_daddr_t bno; - int blksize = mp->m_attr_geo->blksize; + unsigned int blksize = mp->m_attr_geo->blksize; /* no verification of non-crc buffers */ if (!xfs_has_crc(mp)) @@ -141,7 +153,7 @@ __xfs_attr3_rmt_read_verify( *failaddr = __this_address; return -EFSBADCRC; } - *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); + *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, bno); if (*failaddr) return -EFSCORRUPTED; len -= blksize; @@ -186,7 +198,7 @@ xfs_attr3_rmt_write_verify( { struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - int blksize = mp->m_attr_geo->blksize; + unsigned int blksize = mp->m_attr_geo->blksize; char *ptr; int len; xfs_daddr_t bno; @@ -203,7 +215,7 @@ xfs_attr3_rmt_write_verify( while (len > 0) { struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr; - fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); + fa = xfs_attr3_rmt_verify(mp, bp, ptr, bno); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; @@ -280,30 +292,30 @@ xfs_attr_rmtval_copyout( struct xfs_mount *mp, struct xfs_buf *bp, struct xfs_inode *dp, - int *offset, - int *valuelen, + xfs_ino_t owner, + unsigned int *offset, + unsigned int *valuelen, uint8_t **dst) { char *src = 
bp->b_addr; - xfs_ino_t ino = dp->i_ino; xfs_daddr_t bno = xfs_buf_daddr(bp); - int len = BBTOB(bp->b_length); - int blksize = mp->m_attr_geo->blksize; + unsigned int len = BBTOB(bp->b_length); + unsigned int blksize = mp->m_attr_geo->blksize; ASSERT(len >= blksize); while (len > 0 && *valuelen > 0) { - int hdr_size = 0; - int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); + unsigned int hdr_size = 0; + unsigned int byte_cnt = xfs_attr3_rmt_buf_space(mp); byte_cnt = min(*valuelen, byte_cnt); if (xfs_has_crc(mp)) { - if (xfs_attr3_rmt_hdr_ok(src, ino, *offset, + if (xfs_attr3_rmt_hdr_ok(src, owner, *offset, byte_cnt, bno)) { xfs_alert(mp, "remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", - bno, *offset, byte_cnt, ino); + bno, *offset, byte_cnt, owner); xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); return -EFSCORRUPTED; } @@ -330,20 +342,20 @@ xfs_attr_rmtval_copyin( struct xfs_mount *mp, struct xfs_buf *bp, xfs_ino_t ino, - int *offset, - int *valuelen, + unsigned int *offset, + unsigned int *valuelen, uint8_t **src) { char *dst = bp->b_addr; xfs_daddr_t bno = xfs_buf_daddr(bp); - int len = BBTOB(bp->b_length); - int blksize = mp->m_attr_geo->blksize; + unsigned int len = BBTOB(bp->b_length); + unsigned int blksize = mp->m_attr_geo->blksize; ASSERT(len >= blksize); while (len > 0 && *valuelen > 0) { - int hdr_size; - int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); + unsigned int hdr_size; + unsigned int byte_cnt = xfs_attr3_rmt_buf_space(mp); byte_cnt = min(*valuelen, byte_cnt); hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset, @@ -389,12 +401,12 @@ xfs_attr_rmtval_get( struct xfs_buf *bp; xfs_dablk_t lblkno = args->rmtblkno; uint8_t *dst = args->value; - int valuelen; + unsigned int valuelen; int nmap; int error; - int blkcnt = args->rmtblkcnt; + unsigned int blkcnt = args->rmtblkcnt; int i; - int offset = 0; + unsigned int offset = 0; trace_xfs_attr_rmtval_get(args); @@ -427,8 +439,7 @@ xfs_attr_rmtval_get( return error; 
error = xfs_attr_rmtval_copyout(mp, bp, args->dp, - &offset, &valuelen, - &dst); + args->owner, &offset, &valuelen, &dst); xfs_buf_relse(bp); if (error) return error; @@ -453,7 +464,7 @@ xfs_attr_rmt_find_hole( struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; int error; - int blkcnt; + unsigned int blkcnt; xfs_fileoff_t lfileoff = 0; /* @@ -482,11 +493,11 @@ xfs_attr_rmtval_set_value( struct xfs_bmbt_irec map; xfs_dablk_t lblkno; uint8_t *src = args->value; - int blkcnt; - int valuelen; + unsigned int blkcnt; + unsigned int valuelen; int nmap; int error; - int offset = 0; + unsigned int offset = 0; /* * Roll through the "value", copying the attribute value to the @@ -522,8 +533,8 @@ xfs_attr_rmtval_set_value( return error; bp->b_ops = &xfs_attr3_rmt_buf_ops; - xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, - &valuelen, &src); + xfs_attr_rmtval_copyin(mp, bp, args->owner, &offset, &valuelen, + &src); error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ xfs_buf_relse(bp); @@ -626,7 +637,6 @@ xfs_attr_rmtval_set_blk( if (error) return error; - ASSERT(nmap == 1); ASSERT((map->br_startblock != DELAYSTARTBLOCK) && (map->br_startblock != HOLESTARTBLOCK)); @@ -646,7 +656,7 @@ xfs_attr_rmtval_invalidate( struct xfs_da_args *args) { xfs_dablk_t lblkno; - int blkcnt; + unsigned int blkcnt; int error; /* diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index d097ec6c4dc3..e3c6c7d774bf 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -6,7 +6,13 @@ #ifndef __XFS_ATTR_REMOTE_H__ #define __XFS_ATTR_REMOTE_H__ -int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); +unsigned int xfs_attr3_rmt_blocks(struct xfs_mount *mp, unsigned int attrlen); + +/* Number of rmt blocks needed to store the maximally sized attr value */ +static inline unsigned int xfs_attr3_max_rmt_blocks(struct xfs_mount *mp) +{ + return xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); +} int 
xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index bc4422223024..73bdc0e55682 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -16,6 +16,7 @@ typedef struct xfs_attr_sf_sort { uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ xfs_dahash_t hash; /* this entry's hash value */ unsigned char *name; /* name value, pointer into buffer */ + void *value; } xfs_attr_sf_sort_t; #define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 656c95a22f2e..3b3206d312d6 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -779,7 +779,7 @@ xfs_bmap_local_to_extents_empty( } -STATIC int /* error */ +int /* error */ xfs_bmap_local_to_extents( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode pointer */ @@ -789,7 +789,8 @@ xfs_bmap_local_to_extents( void (*init_fn)(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, - struct xfs_ifork *ifp)) + struct xfs_ifork *ifp, void *priv), + void *priv) { int error = 0; int flags; /* logging flags returned */ @@ -850,7 +851,7 @@ xfs_bmap_local_to_extents( * log here. Note that init_fn must also set the buffer log item type * correctly. 
*/ - init_fn(tp, bp, ip, ifp); + init_fn(tp, bp, ip, ifp, priv); /* account for the change in fork size */ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); @@ -976,13 +977,14 @@ xfs_bmap_add_attrfork_local( dargs.total = dargs.geo->fsbcount; dargs.whichfork = XFS_DATA_FORK; dargs.trans = tp; + dargs.owner = ip->i_ino; return xfs_dir2_sf_to_block(&dargs); } if (S_ISLNK(VFS_I(ip)->i_mode)) return xfs_bmap_local_to_extents(tp, ip, 1, flags, - XFS_DATA_FORK, - xfs_symlink_local_to_remote); + XFS_DATA_FORK, xfs_symlink_local_to_remote, + NULL); /* should only be called for types that support local format data */ ASSERT(0); @@ -1023,40 +1025,29 @@ xfs_bmap_set_attrforkoff( } /* - * Convert inode from non-attributed to attributed. - * Must not be in a transaction, ip must not be locked. + * Convert inode from non-attributed to attributed. Caller must hold the + * ILOCK_EXCL and the file cannot have an attr fork. */ int /* error code */ xfs_bmap_add_attrfork( - xfs_inode_t *ip, /* incore inode pointer */ + struct xfs_trans *tp, + struct xfs_inode *ip, /* incore inode pointer */ int size, /* space new attribute needs */ int rsvd) /* xact may use reserved blks */ { - xfs_mount_t *mp; /* mount structure */ - xfs_trans_t *tp; /* transaction pointer */ - int blks; /* space reservation */ + struct xfs_mount *mp = tp->t_mountp; int version = 1; /* superblock attr version */ int logflags; /* logging flags */ int error; /* error return value */ - ASSERT(xfs_inode_has_attr_fork(ip) == 0); - - mp = ip->i_mount; + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); - - blks = XFS_ADDAFORK_SPACE_RES(mp); - - error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0, - rsvd, &tp); - if (error) - return error; - if (xfs_inode_has_attr_fork(ip)) - goto trans_cancel; + ASSERT(!xfs_inode_has_attr_fork(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_bmap_set_attrforkoff(ip, size, &version); if (error) - goto trans_cancel; + return error; 
xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); logflags = 0; @@ -1077,7 +1068,7 @@ xfs_bmap_add_attrfork( if (logflags) xfs_trans_log_inode(tp, ip, logflags); if (error) - goto trans_cancel; + return error; if (!xfs_has_attr(mp) || (!xfs_has_attr2(mp) && version == 2)) { bool log_sb = false; @@ -1096,14 +1087,7 @@ xfs_bmap_add_attrfork( xfs_log_sb(tp); } - error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; - -trans_cancel: - xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; + return 0; } /* @@ -1586,6 +1570,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: @@ -1616,6 +1601,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -1650,6 +1636,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: @@ -1684,6 +1671,7 @@ xfs_bmap_add_extent_delay_real( goto done; } } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: @@ -1722,6 +1710,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING: @@ -1812,6 +1801,7 @@ xfs_bmap_add_extent_delay_real( xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); xfs_iext_next(ifp, &bma->icur); xfs_iext_update_extent(bma->ip, state, &bma->icur, &RIGHT); + ASSERT(da_new <= da_old); break; case BMAP_RIGHT_FILLING: @@ -1861,6 +1851,7 @@ xfs_bmap_add_extent_delay_real( PREV.br_blockcount = temp; xfs_iext_insert(bma->ip, &bma->icur, &PREV, state); xfs_iext_next(ifp, &bma->icur); + ASSERT(da_new <= da_old); break; case 0: @@ -1975,7 +1966,7 @@ xfs_bmap_add_extent_delay_real( } if (da_new != da_old) - xfs_mod_delalloc(mp, (int64_t)da_new - da_old); + xfs_mod_delalloc(bma->ip, 0, (int64_t)da_new 
- da_old); if (bma->cur) { da_new += bma->cur->bc_bmap.allocated; @@ -1983,11 +1974,10 @@ xfs_bmap_add_extent_delay_real( } /* adjust for changes in reserved delayed indirect blocks */ - if (da_new != da_old) { - ASSERT(state == 0 || da_new < da_old); - error = xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), - false); - } + if (da_new < da_old) + xfs_add_fdblocks(mp, da_old - da_new); + else if (da_new > da_old) + error = xfs_dec_fdblocks(mp, da_new - da_old, true); xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: @@ -2688,12 +2678,12 @@ xfs_bmap_add_extent_hole_delay( } if (oldlen != newlen) { ASSERT(oldlen > newlen); - xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen), - false); + xfs_add_fdblocks(ip->i_mount, oldlen - newlen); + /* * Nothing to do for disk quota accounting here. */ - xfs_mod_delalloc(ip->i_mount, (int64_t)newlen - oldlen); + xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen); } } @@ -3370,7 +3360,7 @@ xfs_bmap_alloc_account( * yet. */ if (ap->wasdel) { - xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length); + xfs_mod_delalloc(ap->ip, -(int64_t)ap->length, 0); return; } @@ -3394,7 +3384,7 @@ xfs_bmap_alloc_account( xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) { ap->ip->i_delayed_blks -= ap->length; - xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length); + xfs_mod_delalloc(ap->ip, -(int64_t)ap->length, 0); fld = isrt ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_DELBCOUNT; } else { fld = isrt ? 
XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; @@ -4066,6 +4056,7 @@ xfs_bmapi_reserve_delalloc( struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_extlen_t alen; xfs_extlen_t indlen; + uint64_t fdblocks; int error; xfs_fileoff_t aoff = off; @@ -4108,17 +4099,21 @@ xfs_bmapi_reserve_delalloc( indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); ASSERT(indlen > 0); - error = xfs_mod_fdblocks(mp, -((int64_t)alen), false); - if (error) - goto out_unreserve_quota; + fdblocks = indlen; + if (XFS_IS_REALTIME_INODE(ip)) { + error = xfs_dec_frextents(mp, xfs_rtb_to_rtx(mp, alen)); + if (error) + goto out_unreserve_quota; + } else { + fdblocks += alen; + } - error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false); + error = xfs_dec_fdblocks(mp, fdblocks, false); if (error) - goto out_unreserve_blocks; - + goto out_unreserve_frextents; ip->i_delayed_blks += alen; - xfs_mod_delalloc(ip->i_mount, alen + indlen); + xfs_mod_delalloc(ip, alen, indlen); got->br_startoff = aoff; got->br_startblock = nullstartblock(indlen); @@ -4139,8 +4134,9 @@ xfs_bmapi_reserve_delalloc( return 0; -out_unreserve_blocks: - xfs_mod_fdblocks(mp, alen, false); +out_unreserve_frextents: + if (XFS_IS_REALTIME_INODE(ip)) + xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, alen)); out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) xfs_quota_unreserve_blkres(ip, alen); @@ -4191,26 +4187,10 @@ xfs_bmapi_allocate( struct xfs_mount *mp = bma->ip->i_mount; int whichfork = xfs_bmapi_whichfork(bma->flags); struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); - int tmp_logflags = 0; int error; ASSERT(bma->length > 0); - - /* - * For the wasdelay case, we could also just allocate the stuff asked - * for in this bmap call but that wouldn't be as good. 
- */ - if (bma->wasdel) { - bma->length = (xfs_extlen_t)bma->got.br_blockcount; - bma->offset = bma->got.br_startoff; - if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev)) - bma->prev.br_startoff = NULLFILEOFF; - } else { - bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN); - if (!bma->eof) - bma->length = XFS_FILBLKS_MIN(bma->length, - bma->got.br_startoff - bma->offset); - } + ASSERT(bma->length <= XFS_MAX_BMBT_EXTLEN); if (bma->flags & XFS_BMAPI_CONTIG) bma->minlen = bma->length; @@ -4226,8 +4206,15 @@ xfs_bmapi_allocate( } else { error = xfs_bmap_alloc_userdata(bma); } - if (error || bma->blkno == NULLFSBLOCK) + if (error) return error; + if (bma->blkno == NULLFSBLOCK) + return -ENOSPC; + + if (WARN_ON_ONCE(!xfs_valid_startblock(bma->ip, bma->blkno))) { + xfs_bmap_mark_sick(bma->ip, whichfork); + return -EFSCORRUPTED; + } if (bma->flags & XFS_BMAPI_ZERO) { error = xfs_zero_extent(bma->ip, bma->blkno, bma->length); @@ -4260,8 +4247,6 @@ xfs_bmapi_allocate( error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, whichfork, &bma->icur, &bma->cur, &bma->got, &bma->logflags, bma->flags); - - bma->logflags |= tmp_logflags; if (error) return error; @@ -4406,6 +4391,15 @@ xfs_bmapi_finish( * extent state if necessary. Details behaviour is controlled by the flags * parameter. Only allocates blocks from a single allocation group, to avoid * locking problems. + * + * Returns 0 on success and places the extent mappings in mval. nmaps is used + * as an input/output parameter where the caller specifies the maximum number + * of mappings that may be returned and xfs_bmapi_write passes back the number + * of mappings (including existing mappings) it found. + * + * Returns a negative error code on failure, including -ENOSPC when it could not + * allocate any blocks and -ENOSR when it did allocate blocks to convert a + * delalloc range, but those blocks were before the passed in range. 
*/ int xfs_bmapi_write( @@ -4524,20 +4518,33 @@ xfs_bmapi_write( * allocation length request (which can be 64 bits in * length) and the bma length request, which is * xfs_extlen_t and therefore 32 bits. Hence we have to - * check for 32-bit overflows and handle them here. + * be careful and do the min() using the larger type to + * avoid overflows. */ - if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN) - bma.length = XFS_MAX_BMBT_EXTLEN; - else - bma.length = len; + bma.length = XFS_FILBLKS_MIN(len, XFS_MAX_BMBT_EXTLEN); + + if (wasdelay) { + bma.length = XFS_FILBLKS_MIN(bma.length, + bma.got.br_blockcount - + (bno - bma.got.br_startoff)); + } else { + if (!eof) + bma.length = XFS_FILBLKS_MIN(bma.length, + bma.got.br_startoff - bno); + } - ASSERT(len > 0); ASSERT(bma.length > 0); error = xfs_bmapi_allocate(&bma); - if (error) + if (error) { + /* + * If we already allocated space in a previous + * iteration return what we go so far when + * running out of space. + */ + if (error == -ENOSPC && bma.nallocs) + break; goto error0; - if (bma.blkno == NULLFSBLOCK) - break; + } /* * If this is a CoW allocation, record the data in @@ -4575,7 +4582,6 @@ xfs_bmapi_write( if (!xfs_iext_next_extent(ifp, &bma.icur, &bma.got)) eof = true; } - *nmap = n; error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, whichfork); @@ -4586,7 +4592,22 @@ xfs_bmapi_write( ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork)); xfs_bmapi_finish(&bma, whichfork, 0); xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, - orig_nmap, *nmap); + orig_nmap, n); + + /* + * When converting delayed allocations, xfs_bmapi_allocate ignores + * the passed in bno and always converts from the start of the found + * delalloc extent. + * + * To avoid a successful return with *nmap set to 0, return the magic + * -ENOSR error code for this particular case so that the caller can + * handle it. 
+ */ + if (!n) { + ASSERT(bma.nallocs >= *nmap); + return -ENOSR; + } + *nmap = n; return 0; error0: xfs_bmapi_finish(&bma, whichfork, error); @@ -4599,8 +4620,8 @@ error0: * invocations to allocate the target offset if a large enough physical extent * is not available. */ -int -xfs_bmapi_convert_delalloc( +static int +xfs_bmapi_convert_one_delalloc( struct xfs_inode *ip, int whichfork, xfs_off_t offset, @@ -4630,11 +4651,8 @@ xfs_bmapi_convert_delalloc( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_iext_count_may_overflow(ip, whichfork, + error = xfs_iext_count_extend(tp, ip, whichfork, XFS_IEXT_ADD_NOSPLIT_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, - XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto out_trans_cancel; @@ -4657,19 +4675,25 @@ xfs_bmapi_convert_delalloc( if (!isnullstartblock(bma.got.br_startblock)) { xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, xfs_iomap_inode_sequence(ip, flags)); - *seq = READ_ONCE(ifp->if_seq); + if (seq) + *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } bma.tp = tp; bma.ip = ip; bma.wasdel = true; - bma.offset = bma.got.br_startoff; - bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, - XFS_MAX_BMBT_EXTLEN); bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); /* + * Always allocate convert from the start of the delalloc extent even if + * that is outside the passed in range to create large contiguous + * extents on disk. 
+ */ + bma.offset = bma.got.br_startoff; + bma.length = bma.got.br_blockcount; + + /* * When we're converting the delalloc reservations backing dirty pages * in the page cache, we must be careful about how we create the new * extents: @@ -4693,22 +4717,14 @@ xfs_bmapi_convert_delalloc( if (error) goto out_finish; - error = -ENOSPC; - if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) - goto out_finish; - if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) { - xfs_bmap_mark_sick(ip, whichfork); - error = -EFSCORRUPTED; - goto out_finish; - } - XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, xfs_iomap_inode_sequence(ip, flags)); - *seq = READ_ONCE(ifp->if_seq); + if (seq) + *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length); @@ -4731,6 +4747,36 @@ out_trans_cancel: return error; } +/* + * Pass in a dellalloc extent and convert it to real extents, return the real + * extent that maps offset_fsb in iomap. + */ +int +xfs_bmapi_convert_delalloc( + struct xfs_inode *ip, + int whichfork, + loff_t offset, + struct iomap *iomap, + unsigned int *seq) +{ + int error; + + /* + * Attempt to allocate whatever delalloc extent currently backs offset + * and put the result into iomap. Allocate in a loop because it may + * take several attempts to allocate real blocks for a contiguous + * delalloc extent if free space is sufficiently fragmented. + */ + do { + error = xfs_bmapi_convert_one_delalloc(ip, whichfork, offset, + iomap, seq); + if (error) + return error; + } while (iomap->offset + iomap->length <= offset); + + return 0; +} + int xfs_bmapi_remap( struct xfs_trans *tp, @@ -4822,32 +4868,18 @@ error0: * ores == 1). The number of stolen blocks is returned. The availability and * subsequent accounting of stolen blocks is the responsibility of the caller. 
*/ -static xfs_filblks_t +static void xfs_bmap_split_indlen( xfs_filblks_t ores, /* original res. */ xfs_filblks_t *indlen1, /* ext1 worst indlen */ - xfs_filblks_t *indlen2, /* ext2 worst indlen */ - xfs_filblks_t avail) /* stealable blocks */ + xfs_filblks_t *indlen2) /* ext2 worst indlen */ { xfs_filblks_t len1 = *indlen1; xfs_filblks_t len2 = *indlen2; xfs_filblks_t nres = len1 + len2; /* new total res. */ - xfs_filblks_t stolen = 0; xfs_filblks_t resfactor; /* - * Steal as many blocks as we can to try and satisfy the worst case - * indlen for both new extents. - */ - if (ores < nres && avail) - stolen = XFS_FILBLKS_MIN(nres - ores, avail); - ores += stolen; - - /* nothing else to do if we've satisfied the new reservation */ - if (ores >= nres) - return stolen; - - /* * We can't meet the total required reservation for the two extents. * Calculate the percent of the overall shortage between both extents * and apply this percentage to each of the requested indlen values. @@ -4891,11 +4923,9 @@ xfs_bmap_split_indlen( *indlen1 = len1; *indlen2 = len2; - - return stolen; } -int +void xfs_bmap_del_extent_delay( struct xfs_inode *ip, int whichfork, @@ -4908,9 +4938,9 @@ xfs_bmap_del_extent_delay( struct xfs_bmbt_irec new; int64_t da_old, da_new, da_diff = 0; xfs_fileoff_t del_endoff, got_endoff; - xfs_filblks_t got_indlen, new_indlen, stolen; + xfs_filblks_t got_indlen, new_indlen, stolen = 0; uint32_t state = xfs_bmap_fork_to_state(whichfork); - int error = 0; + uint64_t fdblocks; bool isrt; XFS_STATS_INC(mp, xs_del_exlist); @@ -4925,18 +4955,12 @@ xfs_bmap_del_extent_delay( ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); - if (isrt) - xfs_mod_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount)); - /* * Update the inode delalloc counter now and wait to update the * sb counters as we might have to borrow some blocks for the * indirect block accounting. 
*/ - ASSERT(!isrt); - error = xfs_quota_unreserve_blkres(ip, del->br_blockcount); - if (error) - return error; + xfs_quota_unreserve_blkres(ip, del->br_blockcount); ip->i_delayed_blks -= del->br_blockcount; if (got->br_startoff == del->br_startoff) @@ -4990,8 +5014,24 @@ xfs_bmap_del_extent_delay( new_indlen = xfs_bmap_worst_indlen(ip, new.br_blockcount); WARN_ON_ONCE(!got_indlen || !new_indlen); - stolen = xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen, - del->br_blockcount); + /* + * Steal as many blocks as we can to try and satisfy the worst + * case indlen for both new extents. + * + * However, we can't just steal reservations from the data + * blocks if this is an RT inodes as the data and metadata + * blocks come from different pools. We'll have to live with + * under-filled indirect reservation in this case. + */ + da_new = got_indlen + new_indlen; + if (da_new > da_old && !isrt) { + stolen = XFS_FILBLKS_MIN(da_new - da_old, + del->br_blockcount); + da_old += stolen; + } + if (da_new > da_old) + xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen); + da_new = got_indlen + new_indlen; got->br_startblock = nullstartblock((int)got_indlen); @@ -5003,20 +5043,21 @@ xfs_bmap_del_extent_delay( xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, &new, state); - da_new = got_indlen + new_indlen - stolen; del->br_blockcount -= stolen; break; } ASSERT(da_old >= da_new); da_diff = da_old - da_new; - if (!isrt) - da_diff += del->br_blockcount; - if (da_diff) { - xfs_mod_fdblocks(mp, da_diff, false); - xfs_mod_delalloc(mp, -da_diff); - } - return error; + fdblocks = da_diff; + + if (isrt) + xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount)); + else + fdblocks += del->br_blockcount; + + xfs_add_fdblocks(mp, fdblocks); + xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff); } void @@ -5107,8 +5148,7 @@ xfs_bmap_del_extent_real( { xfs_fsblock_t del_endblock=0; /* first block past del */ xfs_fileoff_t del_endoff; /* first offset past del */ - int 
do_fx; /* free extent at end of routine */ - int error; /* error return value */ + int error = 0; /* error return value */ struct xfs_bmbt_irec got; /* current extent entry */ xfs_fileoff_t got_endoff; /* first offset past got */ int i; /* temp state */ @@ -5151,20 +5191,10 @@ xfs_bmap_del_extent_real( return -ENOSPC; *logflagsp = XFS_ILOG_CORE; - if (xfs_ifork_is_realtime(ip, whichfork)) { - if (!(bflags & XFS_BMAPI_REMAP)) { - error = xfs_rtfree_blocks(tp, del->br_startblock, - del->br_blockcount); - if (error) - return error; - } - - do_fx = 0; + if (xfs_ifork_is_realtime(ip, whichfork)) qfield = XFS_TRANS_DQ_RTBCOUNT; - } else { - do_fx = 1; + else qfield = XFS_TRANS_DQ_BCOUNT; - } nblks = del->br_blockcount; del_endblock = del->br_startblock + del->br_blockcount; @@ -5312,18 +5342,29 @@ xfs_bmap_del_extent_real( /* * If we need to, add to list of extents to delete. */ - if (do_fx && !(bflags & XFS_BMAPI_REMAP)) { + if (!(bflags & XFS_BMAPI_REMAP)) { if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { xfs_refcount_decrease_extent(tp, del); + } else if (xfs_ifork_is_realtime(ip, whichfork)) { + /* + * Ensure the bitmap and summary inodes are locked + * and joined to the transaction before modifying them. + */ + if (!(tp->t_flags & XFS_TRANS_RTBITMAP_LOCKED)) { + tp->t_flags |= XFS_TRANS_RTBITMAP_LOCKED; + xfs_rtbitmap_lock(tp, mp); + } + error = xfs_rtfree_blocks(tp, del->br_startblock, + del->br_blockcount); } else { error = xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, XFS_AG_RESV_NONE, ((bflags & XFS_BMAPI_NODISCARD) || del->br_state == XFS_EXT_UNWRITTEN)); - if (error) - return error; } + if (error) + return error; } /* @@ -5414,16 +5455,6 @@ __xfs_bunmapi( } else cur = NULL; - if (isrt) { - /* - * Synchronize by locking the bitmap inode. 
- */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); - xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); - xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); - } - extno = 0; while (end != (xfs_fileoff_t)-1 && end >= start && (nexts == 0 || extno < nexts)) { @@ -5584,18 +5615,16 @@ __xfs_bunmapi( delete: if (wasdel) { - error = xfs_bmap_del_extent_delay(ip, whichfork, &icur, - &got, &del); + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); } else { error = xfs_bmap_del_extent_real(ip, tp, &icur, cur, &del, &tmp_logflags, whichfork, flags); logflags |= tmp_logflags; + if (error) + goto error0; } - if (error) - goto error0; - end = del.br_startoff - 1; nodelete: /* diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index f7662595309d..667b0c2b33d1 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -158,7 +158,7 @@ static inline bool xfs_bmap_is_real_extent(const struct xfs_bmbt_irec *irec) * Return true if the extent is a real, allocated extent, or false if it is a * delayed allocation, and unwritten extent or a hole. 
*/ -static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec) +static inline bool xfs_bmap_is_written_extent(const struct xfs_bmbt_irec *irec) { return xfs_bmap_is_real_extent(irec) && irec->br_state != XFS_EXT_UNWRITTEN; @@ -176,9 +176,16 @@ int xfs_bmap_longest_free_extent(struct xfs_perag *pag, void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp); -int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); +int xfs_bmap_add_attrfork(struct xfs_trans *tp, struct xfs_inode *ip, + int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork); +int xfs_bmap_local_to_extents(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_extlen_t total, int *logflagsp, int whichfork, + void (*init_fn)(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_inode *ip, struct xfs_ifork *ifp, + void *priv), + void *priv); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); @@ -195,7 +202,7 @@ int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); -int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, +void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); void xfs_bmap_del_extent_cow(struct xfs_inode *ip, diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 718d071bb21a..16a529a88780 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -252,6 +252,51 @@ xfs_da3_node_verify( return NULL; } +xfs_failaddr_t +xfs_da3_node_header_check( + struct xfs_buf *bp, 
+ xfs_ino_t owner) +{ + struct xfs_mount *mp = bp->b_mount; + + if (xfs_has_crc(mp)) { + struct xfs_da3_blkinfo *hdr3 = bp->b_addr; + + if (hdr3->hdr.magic != cpu_to_be16(XFS_DA3_NODE_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->owner) != owner) + return __this_address; + } + + return NULL; +} + +xfs_failaddr_t +xfs_da3_header_check( + struct xfs_buf *bp, + xfs_ino_t owner) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_da_blkinfo *hdr = bp->b_addr; + + if (!xfs_has_crc(mp)) + return NULL; + + switch (hdr->magic) { + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + return xfs_attr3_leaf_header_check(bp, owner); + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + return xfs_da3_node_header_check(bp, owner); + case cpu_to_be16(XFS_DIR3_LEAF1_MAGIC): + case cpu_to_be16(XFS_DIR3_LEAFN_MAGIC): + return xfs_dir3_leaf_header_check(bp, owner); + } + + ASSERT(0); + return NULL; +} + static void xfs_da3_node_write_verify( struct xfs_buf *bp) @@ -486,7 +531,7 @@ xfs_da3_node_create( memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr)); ichdr.magic = XFS_DA3_NODE_MAGIC; hdr3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->info.owner = cpu_to_be64(args->dp->i_ino); + hdr3->info.owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { ichdr.magic = XFS_DA_NODE_MAGIC; @@ -1199,6 +1244,7 @@ xfs_da3_root_join( struct xfs_da3_icnode_hdr oldroothdr; int error; struct xfs_inode *dp = state->args->dp; + xfs_failaddr_t fa; trace_xfs_da_root_join(state->args); @@ -1225,6 +1271,13 @@ xfs_da3_root_join( error = xfs_da3_node_read(args->trans, dp, child, &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); /* @@ -1259,6 +1312,7 @@ xfs_da3_node_toosmall( struct xfs_da_blkinfo *info; xfs_dablk_t blkno; struct 
xfs_buf *bp; + xfs_failaddr_t fa; struct xfs_da3_icnode_hdr nodehdr; int count; int forward; @@ -1333,6 +1387,13 @@ xfs_da3_node_toosmall( state->args->whichfork); if (error) return error; + fa = xfs_da3_node_header_check(bp, state->args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(state->args->trans, bp); + xfs_da_mark_sick(state->args); + return -EFSCORRUPTED; + } node = bp->b_addr; xfs_da3_node_hdr_from_disk(dp->i_mount, &thdr, node); @@ -1591,6 +1652,7 @@ xfs_da3_node_lookup_int( struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr nodehdr; struct xfs_da_args *args; + xfs_failaddr_t fa; xfs_dablk_t blkno; xfs_dahash_t hashval; xfs_dahash_t btreehashval; @@ -1629,6 +1691,12 @@ xfs_da3_node_lookup_int( if (magic == XFS_ATTR_LEAF_MAGIC || magic == XFS_ATTR3_LEAF_MAGIC) { + fa = xfs_attr3_leaf_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_ATTR_LEAF_MAGIC; blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); break; @@ -1636,6 +1704,12 @@ xfs_da3_node_lookup_int( if (magic == XFS_DIR2_LEAFN_MAGIC || magic == XFS_DIR3_LEAFN_MAGIC) { + fa = xfs_dir3_leaf_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_DIR2_LEAFN_MAGIC; blk->hashval = xfs_dir2_leaf_lasthash(args->dp, blk->bp, NULL); @@ -1648,6 +1722,13 @@ xfs_da3_node_lookup_int( return -EFSCORRUPTED; } + fa = xfs_da3_node_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } + blk->magic = XFS_DA_NODE_MAGIC; /* @@ -1820,6 +1901,7 @@ xfs_da3_blk_link( struct xfs_da_blkinfo *tmp_info; struct xfs_da_args *args; struct xfs_buf *bp; + xfs_failaddr_t fa; int before = 0; int error; struct xfs_inode *dp = state->args->dp; @@ -1863,6 +1945,13 @@ xfs_da3_blk_link( &bp, args->whichfork); if 
(error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == old_info->magic); @@ -1884,6 +1973,13 @@ xfs_da3_blk_link( &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == old_info->magic); @@ -1913,6 +2009,7 @@ xfs_da3_blk_unlink( struct xfs_da_blkinfo *tmp_info; struct xfs_da_args *args; struct xfs_buf *bp; + xfs_failaddr_t fa; int error; /* @@ -1943,6 +2040,13 @@ xfs_da3_blk_unlink( &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); @@ -1960,6 +2064,13 @@ xfs_da3_blk_unlink( &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); @@ -1996,6 +2107,7 @@ xfs_da3_path_shift( struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr nodehdr; struct xfs_buf *bp; + xfs_failaddr_t fa; xfs_dablk_t blkno = 0; int level; int error; @@ -2074,6 +2186,12 @@ xfs_da3_path_shift( switch (be16_to_cpu(info->magic)) { case XFS_DA_NODE_MAGIC: case XFS_DA3_NODE_MAGIC: + fa = xfs_da3_node_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return 
-EFSCORRUPTED; + } blk->magic = XFS_DA_NODE_MAGIC; xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, bp->b_addr); @@ -2087,6 +2205,12 @@ xfs_da3_path_shift( break; case XFS_ATTR_LEAF_MAGIC: case XFS_ATTR3_LEAF_MAGIC: + fa = xfs_attr3_leaf_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_ATTR_LEAF_MAGIC; ASSERT(level == path->active-1); blk->index = 0; @@ -2094,6 +2218,12 @@ xfs_da3_path_shift( break; case XFS_DIR2_LEAFN_MAGIC: case XFS_DIR3_LEAFN_MAGIC: + fa = xfs_dir3_leaf_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_DIR2_LEAFN_MAGIC; ASSERT(level == path->active-1); blk->index = 0; @@ -2167,8 +2297,8 @@ xfs_da_grow_inode_int( struct xfs_inode *dp = args->dp; int w = args->whichfork; xfs_rfsblock_t nblks = dp->i_nblocks; - struct xfs_bmbt_irec map, *mapp; - int nmap, error, got, i, mapi; + struct xfs_bmbt_irec map, *mapp = ↦ + int nmap, error, got, i, mapi = 1; /* * Find a spot in the file space to put the new block. @@ -2184,14 +2314,7 @@ xfs_da_grow_inode_int( error = xfs_bmapi_write(tp, dp, *bno, count, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, args->total, &map, &nmap); - if (error) - return error; - - ASSERT(nmap <= 1); - if (nmap == 1) { - mapp = ↦ - mapi = 1; - } else if (nmap == 0 && count > 1) { + if (error == -ENOSPC && count > 1) { xfs_fileoff_t b; int c; @@ -2209,16 +2332,13 @@ xfs_da_grow_inode_int( args->total, &mapp[mapi], &nmap); if (error) goto out_free_map; - if (nmap < 1) - break; mapi += nmap; b = mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount; } - } else { - mapi = 0; - mapp = NULL; } + if (error) + goto out_free_map; /* * Count the blocks we got, make sure it matches the total. 
@@ -2290,6 +2410,7 @@ xfs_da3_swap_lastblock( struct xfs_buf *last_buf; struct xfs_buf *sib_buf; struct xfs_buf *par_buf; + xfs_failaddr_t fa; xfs_dahash_t dead_hash; xfs_fileoff_t lastoff; xfs_dablk_t dead_blkno; @@ -2326,6 +2447,14 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, last_blkno, &last_buf, w); if (error) return error; + fa = xfs_da3_header_check(last_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(last_buf, fa); + xfs_trans_brelse(tp, last_buf); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } + /* * Copy the last block into the dead buffer and log it. */ @@ -2364,6 +2493,13 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w); if (error) goto done; + fa = xfs_da3_header_check(sib_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(sib_buf, fa); + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto done; + } sib_info = sib_buf->b_addr; if (XFS_IS_CORRUPT(mp, be32_to_cpu(sib_info->forw) != last_blkno || @@ -2385,6 +2521,13 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w); if (error) goto done; + fa = xfs_da3_header_check(sib_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(sib_buf, fa); + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto done; + } sib_info = sib_buf->b_addr; if (XFS_IS_CORRUPT(mp, be32_to_cpu(sib_info->back) != last_blkno || @@ -2408,6 +2551,13 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w); if (error) goto done; + fa = xfs_da3_node_header_check(par_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(par_buf, fa); + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto done; + } par_node = par_buf->b_addr; xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node); if (XFS_IS_CORRUPT(mp, @@ -2457,6 +2607,13 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w); if (error) goto done; + fa = xfs_da3_node_header_check(par_buf, args->owner); + if (fa) 
{ + __xfs_buf_mark_corrupt(par_buf, fa); + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto done; + } par_node = par_buf->b_addr; xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node); if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) { diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 706baf36e175..354d5d65043e 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -54,17 +54,24 @@ enum xfs_dacmp { */ typedef struct xfs_da_args { struct xfs_da_geometry *geo; /* da block geometry */ - const uint8_t *name; /* string (maybe not NULL terminated) */ - int namelen; /* length of string (maybe no NULL) */ - uint8_t filetype; /* filetype of inode for directories */ + const uint8_t *name; /* string (maybe not NULL terminated) */ + const uint8_t *new_name; /* new attr name */ void *value; /* set of bytes (maybe contain NULLs) */ - int valuelen; /* length of value */ - unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */ - unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */ - xfs_dahash_t hashval; /* hash value of name */ - xfs_ino_t inumber; /* input/output inode number */ + void *new_value; /* new xattr value (may contain NULLs) */ struct xfs_inode *dp; /* directory inode to manipulate */ struct xfs_trans *trans; /* current trans (changes over time) */ + + xfs_ino_t inumber; /* input/output inode number */ + xfs_ino_t owner; /* inode that owns the dir/attr data */ + + int valuelen; /* length of value */ + int new_valuelen; /* length of new_value */ + uint8_t filetype; /* filetype of inode for directories */ + uint8_t op_flags; /* operation flags */ + uint8_t attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */ + short namelen; /* length of string (maybe no NULL) */ + short new_namelen; /* length of new attr name */ + xfs_dahash_t hashval; /* hash value of name */ xfs_extlen_t total; /* total blocks needed, for 1st bmap */ int whichfork; /* data or attribute fork */ xfs_dablk_t blkno; /* blkno of 
attr leaf of interest */ @@ -77,7 +84,6 @@ typedef struct xfs_da_args { xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ int rmtvaluelen2; /* remote attr value length in bytes */ - uint32_t op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; @@ -89,10 +95,8 @@ typedef struct xfs_da_args { #define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */ #define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */ -#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */ -#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */ -#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */ -#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */ +#define XFS_DA_OP_RECOVERY (1u << 5) /* Log recovery operation */ +#define XFS_DA_OP_LOGGED (1u << 6) /* Use intent items to track op */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ @@ -100,8 +104,6 @@ typedef struct xfs_da_args { { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_NOTIME, "NOTIME" }, \ - { XFS_DA_OP_REMOVE, "REMOVE" }, \ { XFS_DA_OP_RECOVERY, "RECOVERY" }, \ { XFS_DA_OP_LOGGED, "LOGGED" } @@ -235,6 +237,8 @@ void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp, struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from); void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp, struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from); +xfs_failaddr_t xfs_da3_header_check(struct xfs_buf *bp, xfs_ino_t owner); +xfs_failaddr_t xfs_da3_node_header_check(struct xfs_buf *bp, xfs_ino_t owner); extern struct kmem_cache *xfs_da_state_cache; diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 060e5c96b70f..86de99e2f757 100644 --- 
a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -714,12 +714,30 @@ struct xfs_attr3_leafblock { #define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ #define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ +#define XFS_ATTR_PARENT_BIT 3 /* parent pointer attrs */ #define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ #define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT) #define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT) #define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_PARENT (1u << XFS_ATTR_PARENT_BIT) #define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT) -#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) + +#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | \ + XFS_ATTR_SECURE | \ + XFS_ATTR_PARENT) + +/* Private attr namespaces not exposed to userspace */ +#define XFS_ATTR_PRIVATE_NSP_MASK (XFS_ATTR_PARENT) + +#define XFS_ATTR_ONDISK_MASK (XFS_ATTR_NSP_ONDISK_MASK | \ + XFS_ATTR_LOCAL | \ + XFS_ATTR_INCOMPLETE) + +#define XFS_ATTR_NAMESPACE_STR \ + { XFS_ATTR_LOCAL, "local" }, \ + { XFS_ATTR_ROOT, "root" }, \ + { XFS_ATTR_SECURE, "secure" }, \ + { XFS_ATTR_PARENT, "parent" } /* * Alignment for namelist and valuelist entries (since they are mixed @@ -862,9 +880,7 @@ struct xfs_attr3_rmt_hdr { #define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc) -#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \ - ((bufsize) - (xfs_has_crc((mp)) ? \ - sizeof(struct xfs_attr3_rmt_hdr) : 0)) +unsigned int xfs_attr3_rmt_buf_space(struct xfs_mount *mp); /* Number of bytes in a directory block. 
*/ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp) @@ -875,4 +891,17 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp) xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp, struct xfs_da3_blkinfo *hdr3); +/* + * Parent pointer attribute format definition + * + * The xattr name contains the dirent name. + * The xattr value encodes the parent inode number and generation to ease + * opening parents by handle. + * The xattr hashval is xfs_dir2_namehash() ^ p_ino + */ +struct xfs_parent_rec { + __be64 p_ino; + __be32 p_gen; +} __packed; + #endif /* __XFS_DA_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index c13276095cc0..4a078e07e1a0 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -27,6 +27,7 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_trans_priv.h" +#include "xfs_exchmaps.h" static struct kmem_cache *xfs_defer_pending_cache; @@ -1091,7 +1092,11 @@ xfs_defer_ops_continue( ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); /* Lock the captured resources to the new transaction. 
*/ - if (dfc->dfc_held.dr_inos == 2) + if (dfc->dfc_held.dr_inos > 2) { + xfs_sort_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos); + xfs_lock_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos, + XFS_ILOCK_EXCL); + } else if (dfc->dfc_held.dr_inos == 2) xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); else if (dfc->dfc_held.dr_inos == 1) @@ -1176,6 +1181,10 @@ xfs_defer_init_item_caches(void) error = xfs_attr_intent_init_cache(); if (error) goto err; + error = xfs_exchmaps_intent_init_cache(); + if (error) + goto err; + return 0; err: xfs_defer_destroy_item_caches(); @@ -1186,6 +1195,7 @@ err: void xfs_defer_destroy_item_caches(void) { + xfs_exchmaps_intent_destroy_cache(); xfs_attr_intent_destroy_cache(); xfs_extfree_intent_destroy_cache(); xfs_bmap_intent_destroy_cache(); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 18a9fb92dde8..8b338031e487 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -72,12 +72,18 @@ extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; extern const struct xfs_defer_op_type xfs_attr_defer_type; - +extern const struct xfs_defer_op_type xfs_exchmaps_defer_type; /* * Deferred operation item relogging limits. */ -#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */ + +/* + * Rename w/ parent pointers can require up to 5 inodes with deferred ops to + * be joined to the transaction: src_dp, target_dp, src_ip, target_ip, and wip. + * These inodes are locked in sorted order by their inode numbers + */ +#define XFS_DEFER_OPS_NR_INODES 5 #define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */ /* Resources that must be held across a transaction roll. 
*/ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 4821519efad4..457f9a38f850 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -250,11 +250,68 @@ xfs_dir_init( args->geo = dp->i_mount->m_dir_geo; args->dp = dp; args->trans = tp; + args->owner = dp->i_ino; error = xfs_dir2_sf_create(args, pdp->i_ino); kfree(args); return error; } +enum xfs_dir2_fmt +xfs_dir2_format( + struct xfs_da_args *args, + int *error) +{ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + xfs_fileoff_t eof; + + xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + *error = 0; + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) + return XFS_DIR2_FMT_SF; + + *error = xfs_bmap_last_offset(dp, &eof, XFS_DATA_FORK); + if (*error) + return XFS_DIR2_FMT_ERROR; + + if (eof == XFS_B_TO_FSB(mp, geo->blksize)) { + if (XFS_IS_CORRUPT(mp, dp->i_disk_size != geo->blksize)) { + xfs_da_mark_sick(args); + *error = -EFSCORRUPTED; + return XFS_DIR2_FMT_ERROR; + } + return XFS_DIR2_FMT_BLOCK; + } + if (eof == geo->leafblk + geo->fsbcount) + return XFS_DIR2_FMT_LEAF; + return XFS_DIR2_FMT_NODE; +} + +int +xfs_dir_createname_args( + struct xfs_da_args *args) +{ + int error; + + if (!args->inumber) + args->op_flags |= XFS_DA_OP_JUSTCHECK; + + switch (xfs_dir2_format(args, &error)) { + case XFS_DIR2_FMT_SF: + return xfs_dir2_sf_addname(args); + case XFS_DIR2_FMT_BLOCK: + return xfs_dir2_block_addname(args); + case XFS_DIR2_FMT_LEAF: + return xfs_dir2_leaf_addname(args); + case XFS_DIR2_FMT_NODE: + return xfs_dir2_node_addname(args); + default: + return error; + } +} + /* * Enter a name in a directory, or check for available space. * If inum is 0, only the available space test is performed. 
@@ -269,7 +326,6 @@ xfs_dir_createname( { struct xfs_da_args *args; int rval; - bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -295,31 +351,9 @@ xfs_dir_createname( args->whichfork = XFS_DATA_FORK; args->trans = tp; args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - if (!inum) - args->op_flags |= XFS_DA_OP_JUSTCHECK; - - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_addname(args); - goto out_free; - } - - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_addname(args); - goto out_free; - } + args->owner = dp->i_ino; - rval = xfs_dir2_isleaf(args, &v); - if (rval) - goto out_free; - if (v) - rval = xfs_dir2_leaf_addname(args); - else - rval = xfs_dir2_node_addname(args); - -out_free: + rval = xfs_dir_createname_args(args); kfree(args); return rval; } @@ -350,6 +384,34 @@ xfs_dir_cilookup_result( return -EEXIST; } +int +xfs_dir_lookup_args( + struct xfs_da_args *args) +{ + int error; + + switch (xfs_dir2_format(args, &error)) { + case XFS_DIR2_FMT_SF: + error = xfs_dir2_sf_lookup(args); + break; + case XFS_DIR2_FMT_BLOCK: + error = xfs_dir2_block_lookup(args); + break; + case XFS_DIR2_FMT_LEAF: + error = xfs_dir2_leaf_lookup(args); + break; + case XFS_DIR2_FMT_NODE: + error = xfs_dir2_node_lookup(args); + break; + default: + break; + } + + if (error != -EEXIST) + return error; + return 0; +} + /* * Lookup a name in a directory, give back the inode number. 
* If ci_name is not NULL, returns the actual name in ci_name if it differs @@ -366,7 +428,6 @@ xfs_dir_lookup( { struct xfs_da_args *args; int rval; - bool v; int lock_mode; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -383,34 +444,12 @@ xfs_dir_lookup( args->whichfork = XFS_DATA_FORK; args->trans = tp; args->op_flags = XFS_DA_OP_OKNOENT; + args->owner = dp->i_ino; if (ci_name) args->op_flags |= XFS_DA_OP_CILOOKUP; lock_mode = xfs_ilock_data_map_shared(dp); - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_lookup(args); - goto out_check_rval; - } - - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_lookup(args); - goto out_check_rval; - } - - rval = xfs_dir2_isleaf(args, &v); - if (rval) - goto out_free; - if (v) - rval = xfs_dir2_leaf_lookup(args); - else - rval = xfs_dir2_node_lookup(args); - -out_check_rval: - if (rval == -EEXIST) - rval = 0; + rval = xfs_dir_lookup_args(args); if (!rval) { *inum = args->inumber; if (ci_name) { @@ -418,12 +457,31 @@ out_check_rval: ci_name->len = args->valuelen; } } -out_free: xfs_iunlock(dp, lock_mode); kfree(args); return rval; } +int +xfs_dir_removename_args( + struct xfs_da_args *args) +{ + int error; + + switch (xfs_dir2_format(args, &error)) { + case XFS_DIR2_FMT_SF: + return xfs_dir2_sf_removename(args); + case XFS_DIR2_FMT_BLOCK: + return xfs_dir2_block_removename(args); + case XFS_DIR2_FMT_LEAF: + return xfs_dir2_leaf_removename(args); + case XFS_DIR2_FMT_NODE: + return xfs_dir2_node_removename(args); + default: + return error; + } +} + /* * Remove an entry from a directory. 
*/ @@ -431,13 +489,12 @@ int xfs_dir_removename( struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, + const struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; int rval; - bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); @@ -456,30 +513,30 @@ xfs_dir_removename( args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; + args->owner = dp->i_ino; + rval = xfs_dir_removename_args(args); + kfree(args); + return rval; +} - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_removename(args); - goto out_free; - } +int +xfs_dir_replace_args( + struct xfs_da_args *args) +{ + int error; - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_removename(args); - goto out_free; + switch (xfs_dir2_format(args, &error)) { + case XFS_DIR2_FMT_SF: + return xfs_dir2_sf_replace(args); + case XFS_DIR2_FMT_BLOCK: + return xfs_dir2_block_replace(args); + case XFS_DIR2_FMT_LEAF: + return xfs_dir2_leaf_replace(args); + case XFS_DIR2_FMT_NODE: + return xfs_dir2_node_replace(args); + default: + return error; } - - rval = xfs_dir2_isleaf(args, &v); - if (rval) - goto out_free; - if (v) - rval = xfs_dir2_leaf_removename(args); - else - rval = xfs_dir2_node_removename(args); -out_free: - kfree(args); - return rval; } /* @@ -495,7 +552,6 @@ xfs_dir_replace( { struct xfs_da_args *args; int rval; - bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -517,28 +573,8 @@ xfs_dir_replace( args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; - - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_replace(args); - goto out_free; - } - - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_replace(args); - goto out_free; - } - - rval = xfs_dir2_isleaf(args, &v); - if (rval) - goto out_free; - if (v) - rval = 
xfs_dir2_leaf_replace(args); - else - rval = xfs_dir2_node_replace(args); -out_free: + args->owner = dp->i_ino; + rval = xfs_dir_replace_args(args); kfree(args); return rval; } @@ -607,57 +643,6 @@ xfs_dir2_grow_inode( } /* - * See if the directory is a single-block form directory. - */ -int -xfs_dir2_isblock( - struct xfs_da_args *args, - bool *isblock) -{ - struct xfs_mount *mp = args->dp->i_mount; - xfs_fileoff_t eof; - int error; - - error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); - if (error) - return error; - - *isblock = false; - if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize) - return 0; - - *isblock = true; - if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) { - xfs_da_mark_sick(args); - return -EFSCORRUPTED; - } - return 0; -} - -/* - * See if the directory is a single-leaf form directory. - */ -int -xfs_dir2_isleaf( - struct xfs_da_args *args, - bool *isleaf) -{ - xfs_fileoff_t eof; - int error; - - error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); - if (error) - return error; - - *isleaf = false; - if (eof != args->geo->leafblk + args->geo->fsbcount) - return 0; - - *isleaf = true; - return 0; -} - -/* * Remove the given block from the directory. * This routine is used for data and free blocks, leaf/node are done * by xfs_da_shrink_inode. 
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 8497d041f316..6dbe6e9ecb49 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -36,6 +36,16 @@ xfs_dir2_samename( return !memcmp(n1->name, n2->name, n1->len); } +enum xfs_dir2_fmt { + XFS_DIR2_FMT_SF, + XFS_DIR2_FMT_BLOCK, + XFS_DIR2_FMT_LEAF, + XFS_DIR2_FMT_NODE, + XFS_DIR2_FMT_ERROR, +}; + +enum xfs_dir2_fmt xfs_dir2_format(struct xfs_da_args *args, int *error); + /* * Convert inode mode to directory entry filetype */ @@ -58,7 +68,7 @@ extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t *inum, struct xfs_name *ci_name); extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t ino, + const struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t tot); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t inum, @@ -66,6 +76,11 @@ extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name); +int xfs_dir_lookup_args(struct xfs_da_args *args); +int xfs_dir_createname_args(struct xfs_da_args *args); +int xfs_dir_removename_args(struct xfs_da_args *args); +int xfs_dir_replace_args(struct xfs_da_args *args); + /* * Direct call from the bmap code, bypassing the generic directory layer. 
*/ @@ -74,8 +89,6 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); /* * Interface routines used by userspace utilities */ -extern int xfs_dir2_isblock(struct xfs_da_args *args, bool *isblock); -extern int xfs_dir2_isleaf(struct xfs_da_args *args, bool *isleaf); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); @@ -101,6 +114,10 @@ extern struct xfs_dir2_data_free *xfs_dir2_data_freefind( extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); +xfs_failaddr_t xfs_dir3_leaf_header_check(struct xfs_buf *bp, xfs_ino_t owner); +xfs_failaddr_t xfs_dir3_data_header_check(struct xfs_buf *bp, xfs_ino_t owner); +xfs_failaddr_t xfs_dir3_block_header_check(struct xfs_buf *bp, xfs_ino_t owner); + extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index a2da007adb46..0f93ed1a4a74 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -115,17 +115,20 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .verify_struct = xfs_dir3_block_verify, }; -static xfs_failaddr_t +xfs_failaddr_t xfs_dir3_block_header_check( - struct xfs_inode *dp, - struct xfs_buf *bp) + struct xfs_buf *bp, + xfs_ino_t owner) { - struct xfs_mount *mp = dp->i_mount; + struct xfs_mount *mp = bp->b_mount; if (xfs_has_crc(mp)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - if (be64_to_cpu(hdr3->owner) != dp->i_ino) + if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->owner) != owner) return __this_address; } @@ -136,6 +139,7 @@ int xfs_dir3_block_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, struct xfs_buf **bpp) { struct xfs_mount *mp = dp->i_mount; @@ -148,7 +152,7 @@ xfs_dir3_block_read( return err; /* Check things that we can't do in the 
verifier. */ - fa = xfs_dir3_block_header_check(dp, *bpp); + fa = xfs_dir3_block_header_check(*bpp, owner); if (fa) { __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); @@ -163,12 +167,13 @@ xfs_dir3_block_read( static void xfs_dir3_block_init( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_buf *bp, - struct xfs_inode *dp) + struct xfs_da_args *args, + struct xfs_buf *bp) { - struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; bp->b_ops = &xfs_dir3_block_buf_ops; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF); @@ -177,7 +182,7 @@ xfs_dir3_block_init( memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->owner = cpu_to_be64(dp->i_ino); + hdr3->owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); return; @@ -382,7 +387,7 @@ xfs_dir2_block_addname( tp = args->trans; /* Read the (one and only) directory block into bp. */ - error = xfs_dir3_block_read(tp, dp, &bp); + error = xfs_dir3_block_read(tp, dp, args->owner, &bp); if (error) return error; @@ -697,7 +702,7 @@ xfs_dir2_block_lookup_int( dp = args->dp; tp = args->trans; - error = xfs_dir3_block_read(tp, dp, &bp); + error = xfs_dir3_block_read(tp, dp, args->owner, &bp); if (error) return error; @@ -981,7 +986,8 @@ xfs_dir2_leaf_to_block( * Read the data block if we don't already have it, give up if it fails. */ if (!dbp) { - error = xfs_dir3_data_read(tp, dp, args->geo->datablk, 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + args->geo->datablk, 0, &dbp); if (error) return error; } @@ -1009,7 +1015,7 @@ xfs_dir2_leaf_to_block( /* * Start converting it to block form. 
*/ - xfs_dir3_block_init(mp, tp, dbp, dp); + xfs_dir3_block_init(args, dbp); needlog = 1; needscan = 0; @@ -1129,7 +1135,7 @@ xfs_dir2_sf_to_block( error = xfs_dir3_data_init(args, blkno, &bp); if (error) goto out_free; - xfs_dir3_block_init(mp, tp, bp, dp); + xfs_dir3_block_init(args, bp); hdr = bp->b_addr; /* @@ -1169,7 +1175,7 @@ xfs_dir2_sf_to_block( * Create entry for . */ dep = bp->b_addr + offset; - dep->inumber = cpu_to_be64(dp->i_ino); + dep->inumber = cpu_to_be64(args->owner); dep->namelen = 1; dep->name[0] = '.'; xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR); diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 7a6d965bea71..ea0b9628df18 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -395,17 +395,20 @@ static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { .verify_write = xfs_dir3_data_write_verify, }; -static xfs_failaddr_t +xfs_failaddr_t xfs_dir3_data_header_check( - struct xfs_inode *dp, - struct xfs_buf *bp) + struct xfs_buf *bp, + xfs_ino_t owner) { - struct xfs_mount *mp = dp->i_mount; + struct xfs_mount *mp = bp->b_mount; if (xfs_has_crc(mp)) { struct xfs_dir3_data_hdr *hdr3 = bp->b_addr; - if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + if (hdr3->hdr.magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->hdr.owner) != owner) return __this_address; } @@ -416,6 +419,7 @@ int xfs_dir3_data_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp) @@ -429,7 +433,7 @@ xfs_dir3_data_read( return err; /* Check things that we can't do in the verifier. 
*/ - fa = xfs_dir3_data_header_check(dp, *bpp); + fa = xfs_dir3_data_header_check(*bpp, owner); if (fa) { __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); @@ -725,7 +729,7 @@ xfs_dir3_data_init( memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->owner = cpu_to_be64(dp->i_ino); + hdr3->owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); } else diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 08dda5ce9d91..71c2f22a3f6e 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -208,6 +208,29 @@ xfs_dir3_leaf_verify( return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr, true); } +xfs_failaddr_t +xfs_dir3_leaf_header_check( + struct xfs_buf *bp, + xfs_ino_t owner) +{ + struct xfs_mount *mp = bp->b_mount; + + if (xfs_has_crc(mp)) { + struct xfs_dir3_leaf *hdr3 = bp->b_addr; + + if (hdr3->hdr.info.hdr.magic != + cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) && + hdr3->hdr.info.hdr.magic != + cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->hdr.info.owner) != owner) + return __this_address; + } + + return NULL; +} + static void xfs_dir3_leaf_read_verify( struct xfs_buf *bp) @@ -271,32 +294,60 @@ int xfs_dir3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); - if (!err && tp && *bpp) + if (err || !(*bpp)) + return err; + + fa = xfs_dir3_leaf_header_check(*bpp, owner); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); + return -EFSCORRUPTED; + } + + if (tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); - return err; + return 0; } int xfs_dir3_leafn_read( struct xfs_trans *tp, struct xfs_inode 
*dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); - if (!err && tp && *bpp) + if (err || !(*bpp)) + return err; + + fa = xfs_dir3_leaf_header_check(*bpp, owner); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); + return -EFSCORRUPTED; + } + + if (tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); - return err; + return 0; } /* @@ -304,12 +355,12 @@ xfs_dir3_leafn_read( */ static void xfs_dir3_leaf_init( - struct xfs_mount *mp, - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp, - xfs_ino_t owner, uint16_t type) { + struct xfs_mount *mp = args->dp->i_mount; + struct xfs_trans *tp = args->trans; struct xfs_dir2_leaf *leaf = bp->b_addr; ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC); @@ -323,7 +374,7 @@ xfs_dir3_leaf_init( ? 
cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); leaf3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - leaf3->info.owner = cpu_to_be64(owner); + leaf3->info.owner = cpu_to_be64(args->owner); uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { memset(leaf, 0, sizeof(*leaf)); @@ -356,7 +407,6 @@ xfs_dir3_leaf_get_buf( { struct xfs_inode *dp = args->dp; struct xfs_trans *tp = args->trans; - struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp; int error; @@ -369,7 +419,7 @@ xfs_dir3_leaf_get_buf( if (error) return error; - xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic); + xfs_dir3_leaf_init(args, bp, magic); xfs_dir3_leaf_log_header(args, bp); if (magic == XFS_DIR2_LEAF1_MAGIC) xfs_dir3_leaf_log_tail(args, bp); @@ -647,7 +697,8 @@ xfs_dir2_leaf_addname( trace_xfs_dir2_leaf_addname(args); - error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->owner, args->geo->leafblk, + &lbp); if (error) return error; @@ -834,9 +885,9 @@ xfs_dir2_leaf_addname( * Already had space in some data block. * Just read that one in. 
*/ - error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(args->geo, use_block), - 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(args->geo, use_block), 0, + &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1238,7 +1289,8 @@ xfs_dir2_leaf_lookup_int( tp = args->trans; mp = dp->i_mount; - error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->owner, args->geo->leafblk, + &lbp); if (error) return error; @@ -1276,9 +1328,9 @@ xfs_dir2_leaf_lookup_int( if (newdb != curdb) { if (dbp) xfs_trans_brelse(tp, dbp); - error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(args->geo, newdb), - 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(args->geo, newdb), 0, + &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1318,9 +1370,9 @@ xfs_dir2_leaf_lookup_int( ASSERT(cidb != -1); if (cidb != curdb) { xfs_trans_brelse(tp, dbp); - error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(args->geo, cidb), - 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(args->geo, cidb), 0, + &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1614,7 +1666,8 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(geo, db), 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(geo, db), 0, &dbp); if (error) return error; @@ -1753,7 +1806,8 @@ xfs_dir2_node_to_leaf( /* * Read the freespace block. 
*/ - error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp); + error = xfs_dir2_free_read(tp, dp, args->owner, args->geo->freeblk, + &fbp); if (error) return error; xfs_dir2_free_hdr_from_disk(mp, &freehdr, fbp->b_addr); diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index be0b8834028c..fe8d4fa13128 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -175,11 +175,11 @@ const struct xfs_buf_ops xfs_dir3_free_buf_ops = { /* Everything ok in the free block header? */ static xfs_failaddr_t xfs_dir3_free_header_check( - struct xfs_inode *dp, - xfs_dablk_t fbno, - struct xfs_buf *bp) + struct xfs_buf *bp, + xfs_ino_t owner, + xfs_dablk_t fbno) { - struct xfs_mount *mp = dp->i_mount; + struct xfs_mount *mp = bp->b_mount; int maxbests = mp->m_dir_geo->free_max_bests; unsigned int firstdb; @@ -195,7 +195,7 @@ xfs_dir3_free_header_check( return __this_address; if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused)) return __this_address; - if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + if (be64_to_cpu(hdr3->hdr.owner) != owner) return __this_address; } else { struct xfs_dir2_free_hdr *hdr = bp->b_addr; @@ -214,6 +214,7 @@ static int __xfs_dir3_free_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, unsigned int flags, struct xfs_buf **bpp) @@ -227,7 +228,7 @@ __xfs_dir3_free_read( return err; /* Check things that we can't do in the verifier. 
*/ - fa = xfs_dir3_free_header_check(dp, fbno, *bpp); + fa = xfs_dir3_free_header_check(*bpp, owner, fbno); if (fa) { __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); @@ -299,20 +300,23 @@ int xfs_dir2_free_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir3_free_read(tp, dp, fbno, 0, bpp); + return __xfs_dir3_free_read(tp, dp, owner, fbno, 0, bpp); } static int xfs_dir2_free_try_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir3_free_read(tp, dp, fbno, XFS_DABUF_MAP_HOLE_OK, bpp); + return __xfs_dir3_free_read(tp, dp, owner, fbno, XFS_DABUF_MAP_HOLE_OK, + bpp); } static int @@ -349,7 +353,7 @@ xfs_dir3_free_get_buf( hdr.magic = XFS_DIR3_FREE_MAGIC; hdr3->hdr.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->hdr.owner = cpu_to_be64(dp->i_ino); + hdr3->hdr.owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid); } else hdr.magic = XFS_DIR2_FREE_MAGIC; @@ -717,7 +721,7 @@ xfs_dir2_leafn_lookup_for_addname( if (curbp) xfs_trans_brelse(tp, curbp); - error = xfs_dir2_free_read(tp, dp, + error = xfs_dir2_free_read(tp, dp, args->owner, xfs_dir2_db_to_da(args->geo, newfdb), &curbp); @@ -863,7 +867,7 @@ xfs_dir2_leafn_lookup_for_entry( ASSERT(state->extravalid); curbp = state->extrablk.bp; } else { - error = xfs_dir3_data_read(tp, dp, + error = xfs_dir3_data_read(tp, dp, args->owner, xfs_dir2_db_to_da(args->geo, newdb), 0, &curbp); @@ -1356,8 +1360,8 @@ xfs_dir2_leafn_remove( * read in the free block. */ fdb = xfs_dir2_db_to_fdb(geo, db); - error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(geo, fdb), - &fbp); + error = xfs_dir2_free_read(tp, dp, args->owner, + xfs_dir2_db_to_da(geo, fdb), &fbp); if (error) return error; free = fbp->b_addr; @@ -1562,7 +1566,8 @@ xfs_dir2_leafn_toosmall( /* * Read the sibling leaf block. 
*/ - error = xfs_dir3_leafn_read(state->args->trans, dp, blkno, &bp); + error = xfs_dir3_leafn_read(state->args->trans, dp, + state->args->owner, blkno, &bp); if (error) return error; @@ -1715,7 +1720,7 @@ xfs_dir2_node_add_datablk( * that was just allocated. */ fbno = xfs_dir2_db_to_fdb(args->geo, *dbno); - error = xfs_dir2_free_try_read(tp, dp, + error = xfs_dir2_free_try_read(tp, dp, args->owner, xfs_dir2_db_to_da(args->geo, fbno), &fbp); if (error) return error; @@ -1862,7 +1867,7 @@ xfs_dir2_node_find_freeblk( * so this might not succeed. This should be really rare, so * there's no reason to avoid it. */ - error = xfs_dir2_free_try_read(tp, dp, + error = xfs_dir2_free_try_read(tp, dp, args->owner, xfs_dir2_db_to_da(args->geo, fbno), &fbp); if (error) @@ -1948,9 +1953,8 @@ xfs_dir2_node_addname_int( &freehdr, &findex); } else { /* Read the data block in. */ - error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(args->geo, dbno), - 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(args->geo, dbno), 0, &dbp); } if (error) return error; @@ -2302,7 +2306,7 @@ xfs_dir2_node_trim_free( /* * Read the freespace block. 
*/ - error = xfs_dir2_free_try_read(tp, dp, fo, &bp); + error = xfs_dir2_free_try_read(tp, dp, args->owner, fo, &bp); if (error) return error; /* diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 1db2e60ba827..3befb32509fa 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -50,8 +50,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args, /* xfs_dir2_block.c */ -extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_buf **bpp); +int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, struct xfs_buf **bpp); extern int xfs_dir2_block_addname(struct xfs_da_args *args); extern int xfs_dir2_block_lookup(struct xfs_da_args *args); extern int xfs_dir2_block_removename(struct xfs_da_args *args); @@ -78,7 +78,8 @@ extern void xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern xfs_failaddr_t __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp); + xfs_ino_t owner, xfs_dablk_t bno, unsigned int flags, + struct xfs_buf **bpp); int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno, unsigned int flags); @@ -95,9 +96,9 @@ void xfs_dir2_leaf_hdr_from_disk(struct xfs_mount *mp, void xfs_dir2_leaf_hdr_to_disk(struct xfs_mount *mp, struct xfs_dir2_leaf *to, struct xfs_dir3_icleaf_hdr *from); int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t fbno, struct xfs_buf **bpp); + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp); int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t fbno, struct xfs_buf **bpp); + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_buf *dbp); extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); @@ -154,8 +155,8 @@ 
extern int xfs_dir2_node_removename(struct xfs_da_args *args); extern int xfs_dir2_node_replace(struct xfs_da_args *args); extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, int *rvalp); -extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t fbno, struct xfs_buf **bpp); +int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp); /* xfs_dir2_sf.c */ xfs_ino_t xfs_dir2_sf_get_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 01a9e86b3037..7002d7676a78 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -63,7 +63,8 @@ #define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 #define XFS_ERRTAG_WB_DELAY_MS 42 #define XFS_ERRTAG_WRITE_DELAY_MS 43 -#define XFS_ERRTAG_MAX 44 +#define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44 +#define XFS_ERRTAG_MAX 45 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -111,5 +112,6 @@ #define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 #define XFS_RANDOM_WB_DELAY_MS 3000 #define XFS_RANDOM_WRITE_DELAY_MS 3000 +#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c new file mode 100644 index 000000000000..2021396651de --- /dev/null +++ b/fs/xfs/libxfs/xfs_exchmaps.c @@ -0,0 +1,1235 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_bmap.h" +#include "xfs_icache.h" +#include "xfs_quota.h" +#include "xfs_exchmaps.h" +#include "xfs_trace.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_error.h" +#include "xfs_errortag.h" +#include "xfs_health.h" +#include "xfs_exchmaps_item.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr.h" +#include "xfs_dir2_priv.h" +#include "xfs_dir2.h" +#include "xfs_symlink_remote.h" + +struct kmem_cache *xfs_exchmaps_intent_cache; + +/* bmbt mappings adjacent to a pair of records. */ +struct xfs_exchmaps_adjacent { + struct xfs_bmbt_irec left1; + struct xfs_bmbt_irec right1; + struct xfs_bmbt_irec left2; + struct xfs_bmbt_irec right2; +}; + +#define ADJACENT_INIT { \ + .left1 = { .br_startblock = HOLESTARTBLOCK }, \ + .right1 = { .br_startblock = HOLESTARTBLOCK }, \ + .left2 = { .br_startblock = HOLESTARTBLOCK }, \ + .right2 = { .br_startblock = HOLESTARTBLOCK }, \ +} + +/* Information to reset reflink flag / CoW fork state after an exchange. */ + +/* + * If the reflink flag is set on either inode, make sure it has an incore CoW + * fork, since all reflink inodes must have them. If there's a CoW fork and it + * has mappings in it, make sure the inodes are tagged appropriately so that + * speculative preallocations can be GC'd if we run low of space. 
+ */ +static inline void +xfs_exchmaps_ensure_cowfork( + struct xfs_inode *ip) +{ + struct xfs_ifork *cfork; + + if (xfs_is_reflink_inode(ip)) + xfs_ifork_init_cow(ip); + + cfork = xfs_ifork_ptr(ip, XFS_COW_FORK); + if (!cfork) + return; + if (cfork->if_bytes > 0) + xfs_inode_set_cowblocks_tag(ip); + else + xfs_inode_clear_cowblocks_tag(ip); +} + +/* + * Adjust the on-disk inode size upwards if needed so that we never add + * mappings into the file past EOF. This is crucial so that log recovery won't + * get confused by the sudden appearance of post-eof mappings. + */ +STATIC void +xfs_exchmaps_update_size( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + xfs_fsize_t new_isize) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_fsize_t len; + + if (new_isize < 0) + return; + + len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount), + new_isize); + + if (len <= ip->i_disk_size) + return; + + trace_xfs_exchmaps_update_inode_size(ip, len); + + ip->i_disk_size = len; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* Advance the incore state tracking after exchanging a mapping. */ +static inline void +xmi_advance( + struct xfs_exchmaps_intent *xmi, + const struct xfs_bmbt_irec *irec) +{ + xmi->xmi_startoff1 += irec->br_blockcount; + xmi->xmi_startoff2 += irec->br_blockcount; + xmi->xmi_blockcount -= irec->br_blockcount; +} + +/* Do we still have more mappings to exchange? */ +static inline bool +xmi_has_more_exchange_work(const struct xfs_exchmaps_intent *xmi) +{ + return xmi->xmi_blockcount > 0; +} + +/* Do we have post-operation cleanups to perform? */ +static inline bool +xmi_has_postop_work(const struct xfs_exchmaps_intent *xmi) +{ + return xmi->xmi_flags & (XFS_EXCHMAPS_CLEAR_INO1_REFLINK | + XFS_EXCHMAPS_CLEAR_INO2_REFLINK | + __XFS_EXCHMAPS_INO2_SHORTFORM); +} + +/* Check all mappings to make sure we can actually exchange them. 
*/ +int +xfs_exchmaps_check_forks( + struct xfs_mount *mp, + const struct xfs_exchmaps_req *req) +{ + struct xfs_ifork *ifp1, *ifp2; + int whichfork = xfs_exchmaps_reqfork(req); + + /* No fork? */ + ifp1 = xfs_ifork_ptr(req->ip1, whichfork); + ifp2 = xfs_ifork_ptr(req->ip2, whichfork); + if (!ifp1 || !ifp2) + return -EINVAL; + + /* We don't know how to exchange local format forks. */ + if (ifp1->if_format == XFS_DINODE_FMT_LOCAL || + ifp2->if_format == XFS_DINODE_FMT_LOCAL) + return -EINVAL; + + return 0; +} + +#ifdef CONFIG_XFS_QUOTA +/* Log the actual updates to the quota accounting. */ +static inline void +xfs_exchmaps_update_quota( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi, + struct xfs_bmbt_irec *irec1, + struct xfs_bmbt_irec *irec2) +{ + int64_t ip1_delta = 0, ip2_delta = 0; + unsigned int qflag; + + qflag = XFS_IS_REALTIME_INODE(xmi->xmi_ip1) ? XFS_TRANS_DQ_RTBCOUNT : + XFS_TRANS_DQ_BCOUNT; + + if (xfs_bmap_is_real_extent(irec1)) { + ip1_delta -= irec1->br_blockcount; + ip2_delta += irec1->br_blockcount; + } + + if (xfs_bmap_is_real_extent(irec2)) { + ip1_delta += irec2->br_blockcount; + ip2_delta -= irec2->br_blockcount; + } + + xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip1, qflag, ip1_delta); + xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip2, qflag, ip2_delta); +} +#else +# define xfs_exchmaps_update_quota(tp, xmi, irec1, irec2) ((void)0) +#endif + +/* Decide if we want to skip this mapping from file1. */ +static inline bool +xfs_exchmaps_can_skip_mapping( + struct xfs_exchmaps_intent *xmi, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = xmi->xmi_ip1->i_mount; + + /* Do not skip this mapping if the caller did not tell us to. */ + if (!(xmi->xmi_flags & XFS_EXCHMAPS_INO1_WRITTEN)) + return false; + + /* Do not skip mapped, written mappings. */ + if (xfs_bmap_is_written_extent(irec)) + return false; + + /* + * The mapping is unwritten or a hole. It cannot be a delalloc + * reservation because we already excluded those. 
It cannot be an + * unwritten extent with dirty page cache because we flushed the page + * cache. For files where the allocation unit is 1FSB (files on the + * data dev, rt files if the extent size is 1FSB), we can safely + * skip this mapping. + */ + if (!xfs_inode_has_bigrtalloc(xmi->xmi_ip1)) + return true; + + /* + * For a realtime file with a multi-fsb allocation unit, the decision + * is trickier because we can only swap full allocation units. + * Unwritten mappings can appear in the middle of an rtx if the rtx is + * partially written, but they can also appear for preallocations. + * + * If the mapping is a hole, skip it entirely. Holes should align with + * rtx boundaries. + */ + if (!xfs_bmap_is_real_extent(irec)) + return true; + + /* + * All mappings below this point are unwritten. + * + * - If the beginning is not aligned to an rtx, trim the end of the + * mapping so that it does not cross an rtx boundary, and swap it. + * + * - If both ends are aligned to an rtx, skip the entire mapping. + */ + if (!isaligned_64(irec->br_startoff, mp->m_sb.sb_rextsize)) { + xfs_fileoff_t new_end; + + new_end = roundup_64(irec->br_startoff, mp->m_sb.sb_rextsize); + irec->br_blockcount = min(irec->br_blockcount, + new_end - irec->br_startoff); + return false; + } + if (isaligned_64(irec->br_blockcount, mp->m_sb.sb_rextsize)) + return true; + + /* + * All mappings below this point are unwritten, start on an rtx + * boundary, and do not end on an rtx boundary. + * + * - If the mapping is longer than one rtx, trim the end of the mapping + * down to an rtx boundary and skip it. + * + * - The mapping is shorter than one rtx. Swap it. 
+ */ + if (irec->br_blockcount > mp->m_sb.sb_rextsize) { + xfs_fileoff_t new_end; + + new_end = rounddown_64(irec->br_startoff + irec->br_blockcount, + mp->m_sb.sb_rextsize); + irec->br_blockcount = new_end - irec->br_startoff; + return true; + } + + return false; +} + +/* + * Walk forward through the file ranges in @xmi until we find two different + * mappings to exchange. If there is work to do, return the mappings; + * otherwise we've reached the end of the range and xmi_blockcount will be + * zero. + * + * If the walk skips over a pair of mappings to the same storage, save them as + * the left records in @adj (if provided) so that the simulation phase can + * avoid an extra lookup. + */ +static int +xfs_exchmaps_find_mappings( + struct xfs_exchmaps_intent *xmi, + struct xfs_bmbt_irec *irec1, + struct xfs_bmbt_irec *irec2, + struct xfs_exchmaps_adjacent *adj) +{ + int nimaps; + int bmap_flags; + int error; + + bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_whichfork(xmi)); + + for (; xmi_has_more_exchange_work(xmi); xmi_advance(xmi, irec1)) { + /* Read mapping from the first file */ + nimaps = 1; + error = xfs_bmapi_read(xmi->xmi_ip1, xmi->xmi_startoff1, + xmi->xmi_blockcount, irec1, &nimaps, + bmap_flags); + if (error) + return error; + if (nimaps != 1 || + irec1->br_startblock == DELAYSTARTBLOCK || + irec1->br_startoff != xmi->xmi_startoff1) { + /* + * We should never get no mapping or a delalloc mapping + * or something that doesn't match what we asked for, + * since the caller flushed both inodes and we hold the + * ILOCKs for both inodes. 
+ */ + ASSERT(0); + return -EINVAL; + } + + if (xfs_exchmaps_can_skip_mapping(xmi, irec1)) { + trace_xfs_exchmaps_mapping1_skip(xmi->xmi_ip1, irec1); + continue; + } + + /* Read mapping from the second file */ + nimaps = 1; + error = xfs_bmapi_read(xmi->xmi_ip2, xmi->xmi_startoff2, + irec1->br_blockcount, irec2, &nimaps, + bmap_flags); + if (error) + return error; + if (nimaps != 1 || + irec2->br_startblock == DELAYSTARTBLOCK || + irec2->br_startoff != xmi->xmi_startoff2) { + /* + * We should never get no mapping or a delalloc mapping + * or something that doesn't match what we asked for, + * since the caller flushed both inodes and we hold the + * ILOCKs for both inodes. + */ + ASSERT(0); + return -EINVAL; + } + + /* + * We can only exchange as many blocks as the smaller of the + * two mapping maps. + */ + irec1->br_blockcount = min(irec1->br_blockcount, + irec2->br_blockcount); + + trace_xfs_exchmaps_mapping1(xmi->xmi_ip1, irec1); + trace_xfs_exchmaps_mapping2(xmi->xmi_ip2, irec2); + + /* We found something to exchange, so return it. */ + if (irec1->br_startblock != irec2->br_startblock) + return 0; + + /* + * Two mappings pointing to the same physical block must not + * have different states; that's filesystem corruption. Move + * on to the next mapping if they're both holes or both point + * to the same physical space extent. + */ + if (irec1->br_state != irec2->br_state) { + xfs_bmap_mark_sick(xmi->xmi_ip1, + xfs_exchmaps_whichfork(xmi)); + xfs_bmap_mark_sick(xmi->xmi_ip2, + xfs_exchmaps_whichfork(xmi)); + return -EFSCORRUPTED; + } + + /* + * Save the mappings if we're estimating work and skipping + * these identical mappings. + */ + if (adj) { + memcpy(&adj->left1, irec1, sizeof(*irec1)); + memcpy(&adj->left2, irec2, sizeof(*irec2)); + } + } + + return 0; +} + +/* Exchange these two mappings. 
*/ +static void +xfs_exchmaps_one_step( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi, + struct xfs_bmbt_irec *irec1, + struct xfs_bmbt_irec *irec2) +{ + int whichfork = xfs_exchmaps_whichfork(xmi); + + xfs_exchmaps_update_quota(tp, xmi, irec1, irec2); + + /* Remove both mappings. */ + xfs_bmap_unmap_extent(tp, xmi->xmi_ip1, whichfork, irec1); + xfs_bmap_unmap_extent(tp, xmi->xmi_ip2, whichfork, irec2); + + /* + * Re-add both mappings. We exchange the file offsets between the two + * maps and add the opposite map, which has the effect of filling the + * logical offsets we just unmapped, but with the physical mapping + * information exchanged. + */ + swap(irec1->br_startoff, irec2->br_startoff); + xfs_bmap_map_extent(tp, xmi->xmi_ip1, whichfork, irec2); + xfs_bmap_map_extent(tp, xmi->xmi_ip2, whichfork, irec1); + + /* Make sure we're not adding mappings past EOF. */ + if (whichfork == XFS_DATA_FORK) { + xfs_exchmaps_update_size(tp, xmi->xmi_ip1, irec2, + xmi->xmi_isize1); + xfs_exchmaps_update_size(tp, xmi->xmi_ip2, irec1, + xmi->xmi_isize2); + } + + /* + * Advance our cursor and exit. The caller (either defer ops or log + * recovery) will log the XMD item, and if xmi_blockcount is nonzero, it + * will log a new XMI item for the remainder and call us back. + */ + xmi_advance(xmi, irec1); +} + +/* Convert inode2's leaf attr fork back to shortform, if possible.
*/ +STATIC int +xfs_exchmaps_attr_to_sf( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi) +{ + struct xfs_da_args args = { + .dp = xmi->xmi_ip2, + .geo = tp->t_mountp->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .trans = tp, + .owner = xmi->xmi_ip2->i_ino, + }; + struct xfs_buf *bp; + int forkoff; + int error; + + if (!xfs_attr_is_leaf(xmi->xmi_ip2)) + return 0; + + error = xfs_attr3_leaf_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, 0, + &bp); + if (error) + return error; + + forkoff = xfs_attr_shortform_allfit(bp, xmi->xmi_ip2); + if (forkoff == 0) + return 0; + + return xfs_attr3_leaf_to_shortform(bp, &args, forkoff); +} + +/* Convert inode2's block dir fork back to shortform, if possible.. */ +STATIC int +xfs_exchmaps_dir_to_sf( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi) +{ + struct xfs_da_args args = { + .dp = xmi->xmi_ip2, + .geo = tp->t_mountp->m_dir_geo, + .whichfork = XFS_DATA_FORK, + .trans = tp, + .owner = xmi->xmi_ip2->i_ino, + }; + struct xfs_dir2_sf_hdr sfh; + struct xfs_buf *bp; + int size; + int error = 0; + + if (xfs_dir2_format(&args, &error) != XFS_DIR2_FMT_BLOCK) + return error; + + error = xfs_dir3_block_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, &bp); + if (error) + return error; + + size = xfs_dir2_block_sfsize(xmi->xmi_ip2, bp->b_addr, &sfh); + if (size > xfs_inode_data_fork_size(xmi->xmi_ip2)) + return 0; + + return xfs_dir2_block_to_sf(&args, bp, size, &sfh); +} + +/* Convert inode2's remote symlink target back to shortform, if possible. */ +STATIC int +xfs_exchmaps_link_to_sf( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi) +{ + struct xfs_inode *ip = xmi->xmi_ip2; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + char *buf; + int error; + + if (ifp->if_format == XFS_DINODE_FMT_LOCAL || + ip->i_disk_size > xfs_inode_data_fork_size(ip)) + return 0; + + /* Read the current symlink target into a buffer. 
*/ + buf = kmalloc(ip->i_disk_size + 1, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); + if (!buf) { + ASSERT(0); + return -ENOMEM; + } + + error = xfs_symlink_remote_read(ip, buf); + if (error) + goto free; + + /* Remove the blocks. */ + error = xfs_symlink_remote_truncate(tp, ip); + if (error) + goto free; + + /* Convert fork to local format and log our changes. */ + xfs_idestroy_fork(ifp); + ifp->if_bytes = 0; + ifp->if_format = XFS_DINODE_FMT_LOCAL; + xfs_init_local_fork(ip, XFS_DATA_FORK, buf, ip->i_disk_size); + xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); +free: + kfree(buf); + return error; +} + +/* Clear the reflink flag after an exchange. */ +static inline void +xfs_exchmaps_clear_reflink( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + trace_xfs_reflink_unset_inode_flag(ip); + + ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* Finish whatever work might come after an exchange operation. */ +static int +xfs_exchmaps_do_postop_work( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi) +{ + if (xmi->xmi_flags & __XFS_EXCHMAPS_INO2_SHORTFORM) { + int error = 0; + + if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK) + error = xfs_exchmaps_attr_to_sf(tp, xmi); + else if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode)) + error = xfs_exchmaps_dir_to_sf(tp, xmi); + else if (S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode)) + error = xfs_exchmaps_link_to_sf(tp, xmi); + xmi->xmi_flags &= ~__XFS_EXCHMAPS_INO2_SHORTFORM; + if (error) + return error; + } + + if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO1_REFLINK) { + xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip1); + xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO1_REFLINK; + } + + if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO2_REFLINK) { + xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip2); + xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO2_REFLINK; + } + + return 0; +} + +/* Finish one step in a mapping exchange operation, possibly relogging. 
*/ +int +xfs_exchmaps_finish_one( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi) +{ + struct xfs_bmbt_irec irec1, irec2; + int error; + + if (xmi_has_more_exchange_work(xmi)) { + /* + * If the operation state says that some range of the files + * have not yet been exchanged, look for mappings in that range + * to exchange. If we find some mappings, exchange them. + */ + error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, NULL); + if (error) + return error; + + if (xmi_has_more_exchange_work(xmi)) + xfs_exchmaps_one_step(tp, xmi, &irec1, &irec2); + + /* + * If the caller asked us to exchange the file sizes after the + * exchange and either we just exchanged the last mappings in + * the range or we didn't find anything to exchange, update the + * ondisk file sizes. + */ + if ((xmi->xmi_flags & XFS_EXCHMAPS_SET_SIZES) && + !xmi_has_more_exchange_work(xmi)) { + xmi->xmi_ip1->i_disk_size = xmi->xmi_isize1; + xmi->xmi_ip2->i_disk_size = xmi->xmi_isize2; + + xfs_trans_log_inode(tp, xmi->xmi_ip1, XFS_ILOG_CORE); + xfs_trans_log_inode(tp, xmi->xmi_ip2, XFS_ILOG_CORE); + } + } else if (xmi_has_postop_work(xmi)) { + /* + * Now that we're finished with the exchange operation, + * complete the post-op cleanup work. + */ + error = xfs_exchmaps_do_postop_work(tp, xmi); + if (error) + return error; + } + + if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE)) + return -EIO; + + /* If we still have work to do, ask for a new transaction. */ + if (xmi_has_more_exchange_work(xmi) || xmi_has_postop_work(xmi)) { + trace_xfs_exchmaps_defer(tp->t_mountp, xmi); + return -EAGAIN; + } + + /* + * If we reach here, we've finished all the exchange work and the post + * operation work. The last thing we need to do before returning to + * the caller is to make sure that COW forks are set up correctly. 
+ */ + if (!(xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)) { + xfs_exchmaps_ensure_cowfork(xmi->xmi_ip1); + xfs_exchmaps_ensure_cowfork(xmi->xmi_ip2); + } + + return 0; +} + +/* + * Compute the amount of bmbt blocks we should reserve for each file. In the + * worst case, each exchange will fill a hole with a new mapping, which could + * result in a btree split every time we add a new leaf block. + */ +static inline uint64_t +xfs_exchmaps_bmbt_blocks( + struct xfs_mount *mp, + const struct xfs_exchmaps_req *req) +{ + return howmany_64(req->nr_exchanges, + XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp)) * + XFS_EXTENTADD_SPACE_RES(mp, xfs_exchmaps_reqfork(req)); +} + +/* Compute the space we should reserve for the rmap btree expansions. */ +static inline uint64_t +xfs_exchmaps_rmapbt_blocks( + struct xfs_mount *mp, + const struct xfs_exchmaps_req *req) +{ + if (!xfs_has_rmapbt(mp)) + return 0; + if (XFS_IS_REALTIME_INODE(req->ip1)) + return 0; + + return howmany_64(req->nr_exchanges, + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * + XFS_RMAPADD_SPACE_RES(mp); +} + +/* Estimate the bmbt and rmapbt overhead required to exchange mappings. */ +int +xfs_exchmaps_estimate_overhead( + struct xfs_exchmaps_req *req) +{ + struct xfs_mount *mp = req->ip1->i_mount; + xfs_filblks_t bmbt_blocks; + xfs_filblks_t rmapbt_blocks; + xfs_filblks_t resblks = req->resblks; + + /* + * Compute the number of bmbt and rmapbt blocks we might need to handle + * the estimated number of exchanges. + */ + bmbt_blocks = xfs_exchmaps_bmbt_blocks(mp, req); + rmapbt_blocks = xfs_exchmaps_rmapbt_blocks(mp, req); + + trace_xfs_exchmaps_overhead(mp, bmbt_blocks, rmapbt_blocks); + + /* Make sure the change in file block count doesn't overflow.
*/ + if (check_add_overflow(req->ip1_bcount, bmbt_blocks, &req->ip1_bcount)) + return -EFBIG; + if (check_add_overflow(req->ip2_bcount, bmbt_blocks, &req->ip2_bcount)) + return -EFBIG; + + /* + * Add together the number of blocks we need to handle btree growth, + * then add it to the number of blocks we need to reserve for this + * transaction. + */ + if (check_add_overflow(resblks, bmbt_blocks, &resblks)) + return -ENOSPC; + if (check_add_overflow(resblks, bmbt_blocks, &resblks)) + return -ENOSPC; + if (check_add_overflow(resblks, rmapbt_blocks, &resblks)) + return -ENOSPC; + if (check_add_overflow(resblks, rmapbt_blocks, &resblks)) + return -ENOSPC; + + /* Can't reserve more than UINT_MAX blocks; check the new total. */ + if (resblks > UINT_MAX) + return -ENOSPC; + + req->resblks = resblks; + trace_xfs_exchmaps_final_estimate(req); + return 0; +} + +/* Decide if we can merge two real mappings. */ +static inline bool +xmi_can_merge( + const struct xfs_bmbt_irec *b1, + const struct xfs_bmbt_irec *b2) +{ + /* Don't merge holes. */ + if (b1->br_startblock == HOLESTARTBLOCK || + b2->br_startblock == HOLESTARTBLOCK) + return false; + + /* We only merge real extents. */ + if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2)) + return false; + + if (b1->br_startoff + b1->br_blockcount == b2->br_startoff && + b1->br_startblock + b1->br_blockcount == b2->br_startblock && + b1->br_state == b2->br_state && + b1->br_blockcount + b2->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + return true; + + return false; +} + +/* + * Decide if we can merge three mappings. Caller must ensure all three + * mappings are not holes or delalloc reservations.
+ */ +static inline bool +xmi_can_merge_all( + const struct xfs_bmbt_irec *l, + const struct xfs_bmbt_irec *m, + const struct xfs_bmbt_irec *r) +{ + xfs_filblks_t new_len; + + new_len = l->br_blockcount + m->br_blockcount + r->br_blockcount; + return new_len <= XFS_MAX_BMBT_EXTLEN; +} + +#define CLEFT_CONTIG 0x01 +#define CRIGHT_CONTIG 0x02 +#define CHOLE 0x04 +#define CBOTH_CONTIG (CLEFT_CONTIG | CRIGHT_CONTIG) + +#define NLEFT_CONTIG 0x10 +#define NRIGHT_CONTIG 0x20 +#define NHOLE 0x40 +#define NBOTH_CONTIG (NLEFT_CONTIG | NRIGHT_CONTIG) + +/* Estimate the effect of a single exchange on mapping count. */ +static inline int +xmi_delta_nextents_step( + struct xfs_mount *mp, + const struct xfs_bmbt_irec *left, + const struct xfs_bmbt_irec *curr, + const struct xfs_bmbt_irec *new, + const struct xfs_bmbt_irec *right) +{ + bool lhole, rhole, chole, nhole; + unsigned int state = 0; + int ret = 0; + + lhole = left->br_startblock == HOLESTARTBLOCK; + rhole = right->br_startblock == HOLESTARTBLOCK; + chole = curr->br_startblock == HOLESTARTBLOCK; + nhole = new->br_startblock == HOLESTARTBLOCK; + + if (chole) + state |= CHOLE; + if (!lhole && !chole && xmi_can_merge(left, curr)) + state |= CLEFT_CONTIG; + if (!rhole && !chole && xmi_can_merge(curr, right)) + state |= CRIGHT_CONTIG; + if ((state & CBOTH_CONTIG) == CBOTH_CONTIG && + !xmi_can_merge_all(left, curr, right)) + state &= ~CRIGHT_CONTIG; + + if (nhole) + state |= NHOLE; + if (!lhole && !nhole && xmi_can_merge(left, new)) + state |= NLEFT_CONTIG; + if (!rhole && !nhole && xmi_can_merge(new, right)) + state |= NRIGHT_CONTIG; + if ((state & NBOTH_CONTIG) == NBOTH_CONTIG && + !xmi_can_merge_all(left, new, right)) + state &= ~NRIGHT_CONTIG; + + switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) { + case CLEFT_CONTIG | CRIGHT_CONTIG: + /* + * left/curr/right are the same mapping, so deleting curr + * causes 2 new mappings to be created.
+ */ + ret += 2; + break; + case 0: + /* + * curr is not contiguous with any mapping, so we remove curr + * completely + */ + ret--; + break; + case CHOLE: + /* hole, do nothing */ + break; + case CLEFT_CONTIG: + case CRIGHT_CONTIG: + /* trim either left or right, no change */ + break; + } + + switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) { + case NLEFT_CONTIG | NRIGHT_CONTIG: + /* + * left/new/right will become the same mapping, so adding + * new causes the deletion of right. + */ + ret--; + break; + case 0: + /* new is not contiguous with any mapping */ + ret++; + break; + case NHOLE: + /* hole, do nothing. */ + break; + case NLEFT_CONTIG: + case NRIGHT_CONTIG: + /* new is absorbed into left or right, no change */ + break; + } + + trace_xfs_exchmaps_delta_nextents_step(mp, left, curr, new, right, ret, + state); + return ret; +} + +/* Make sure we don't overflow the extent (mapping) counters. */ +static inline int +xmi_ensure_delta_nextents( + struct xfs_exchmaps_req *req, + struct xfs_inode *ip, + int64_t delta) +{ + struct xfs_mount *mp = ip->i_mount; + int whichfork = xfs_exchmaps_reqfork(req); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + uint64_t new_nextents; + xfs_extnum_t max_nextents; + + if (delta < 0) + return 0; + + /* + * It's always an error if the delta causes integer overflow. delta + * needs an explicit cast here to avoid warnings about implicit casts + * coded into the overflow check. + */ + if (check_add_overflow(ifp->if_nextents, (uint64_t)delta, + &new_nextents)) + return -EFBIG; + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && + new_nextents > 10) + return -EFBIG; + + /* + * We always promote both inodes to have large extent counts if the + * superblock feature is enabled, so we only need to check against the + * theoretical maximum.
+ */ + max_nextents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp), + whichfork); + if (new_nextents > max_nextents) + return -EFBIG; + + return 0; +} + +/* Find the next mapping after irec. */ +static inline int +xmi_next( + struct xfs_inode *ip, + int bmap_flags, + const struct xfs_bmbt_irec *irec, + struct xfs_bmbt_irec *nrec) +{ + xfs_fileoff_t off; + xfs_filblks_t blockcount; + int nimaps = 1; + int error; + + off = irec->br_startoff + irec->br_blockcount; + blockcount = XFS_MAX_FILEOFF - off; + error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags); + if (error) + return error; + if (nrec->br_startblock == DELAYSTARTBLOCK || + nrec->br_startoff != off) { + /* + * If we don't get the mapping we want, return a zero-length + * mapping, which our estimator function will pretend is a hole. + * We shouldn't get delalloc reservations. + */ + nrec->br_startblock = HOLESTARTBLOCK; + } + + return 0; +} + +int __init +xfs_exchmaps_intent_init_cache(void) +{ + xfs_exchmaps_intent_cache = kmem_cache_create("xfs_exchmaps_intent", + sizeof(struct xfs_exchmaps_intent), + 0, 0, NULL); + + return xfs_exchmaps_intent_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_exchmaps_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_exchmaps_intent_cache); + xfs_exchmaps_intent_cache = NULL; +} + +/* + * Decide if we will exchange the reflink flags between the two files after the + * exchange. The only time we want to do this is if we're exchanging all + * mappings under EOF and the inode reflink flags have different states. 
+ */ +static inline bool +xmi_can_exchange_reflink_flags( + const struct xfs_exchmaps_req *req, + unsigned int reflink_state) +{ + struct xfs_mount *mp = req->ip1->i_mount; + + if (hweight32(reflink_state) != 1) + return false; + if (req->startoff1 != 0 || req->startoff2 != 0) + return false; + if (req->blockcount != XFS_B_TO_FSB(mp, req->ip1->i_disk_size)) + return false; + if (req->blockcount != XFS_B_TO_FSB(mp, req->ip2->i_disk_size)) + return false; + return true; +} + + +/* Allocate and initialize a new incore intent item from a request. */ +struct xfs_exchmaps_intent * +xfs_exchmaps_init_intent( + const struct xfs_exchmaps_req *req) +{ + struct xfs_exchmaps_intent *xmi; + unsigned int rs = 0; + + xmi = kmem_cache_zalloc(xfs_exchmaps_intent_cache, + GFP_NOFS | __GFP_NOFAIL); + INIT_LIST_HEAD(&xmi->xmi_list); + xmi->xmi_ip1 = req->ip1; + xmi->xmi_ip2 = req->ip2; + xmi->xmi_startoff1 = req->startoff1; + xmi->xmi_startoff2 = req->startoff2; + xmi->xmi_blockcount = req->blockcount; + xmi->xmi_isize1 = xmi->xmi_isize2 = -1; + xmi->xmi_flags = req->flags & XFS_EXCHMAPS_PARAMS; + + if (xfs_exchmaps_whichfork(xmi) == XFS_ATTR_FORK) { + xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM; + return xmi; + } + + if (req->flags & XFS_EXCHMAPS_SET_SIZES) { + xmi->xmi_flags |= XFS_EXCHMAPS_SET_SIZES; + xmi->xmi_isize1 = req->ip2->i_disk_size; + xmi->xmi_isize2 = req->ip1->i_disk_size; + } + + /* Record the state of each inode's reflink flag before the op. */ + if (xfs_is_reflink_inode(req->ip1)) + rs |= 1; + if (xfs_is_reflink_inode(req->ip2)) + rs |= 2; + + /* + * Figure out if we're clearing the reflink flags (which effectively + * exchanges them) after the operation. 
+ */ + if (xmi_can_exchange_reflink_flags(req, rs)) { + if (rs & 1) + xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO1_REFLINK; + if (rs & 2) + xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO2_REFLINK; + } + + if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode) || + S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode)) + xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM; + + return xmi; +} + +/* + * Estimate the number of exchange operations and the number of file blocks + * in each file that will be affected by the exchange operation. + */ +int +xfs_exchmaps_estimate( + struct xfs_exchmaps_req *req) +{ + struct xfs_exchmaps_intent *xmi; + struct xfs_bmbt_irec irec1, irec2; + struct xfs_exchmaps_adjacent adj = ADJACENT_INIT; + xfs_filblks_t ip1_blocks = 0, ip2_blocks = 0; + int64_t d_nexts1, d_nexts2; + int bmap_flags; + int error; + + ASSERT(!(req->flags & ~XFS_EXCHMAPS_PARAMS)); + + bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_reqfork(req)); + xmi = xfs_exchmaps_init_intent(req); + + /* + * To guard against the possibility of overflowing the extent counters, + * we have to estimate an upper bound on the potential increase in that + * counter. We can split the mapping at each end of the range, and for + * each step of the exchange we can split the mapping that we're + * working on if the mappings do not align. + */ + d_nexts1 = d_nexts2 = 3; + + while (xmi_has_more_exchange_work(xmi)) { + /* + * Walk through the file ranges until we find something to + * exchange. Because we're simulating the exchange, pass in + * adj to capture skipped mappings for correct estimation of + * bmbt record merges. + */ + error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, &adj); + if (error) + goto out_free; + if (!xmi_has_more_exchange_work(xmi)) + break; + + /* Update accounting. */ + if (xfs_bmap_is_real_extent(&irec1)) + ip1_blocks += irec1.br_blockcount; + if (xfs_bmap_is_real_extent(&irec2)) + ip2_blocks += irec2.br_blockcount; + req->nr_exchanges++; + + /* Read the next mappings from both files. 
*/ + error = xmi_next(req->ip1, bmap_flags, &irec1, &adj.right1); + if (error) + goto out_free; + + error = xmi_next(req->ip2, bmap_flags, &irec2, &adj.right2); + if (error) + goto out_free; + + /* Update extent count deltas. */ + d_nexts1 += xmi_delta_nextents_step(req->ip1->i_mount, + &adj.left1, &irec1, &irec2, &adj.right1); + + d_nexts2 += xmi_delta_nextents_step(req->ip1->i_mount, + &adj.left2, &irec2, &irec1, &adj.right2); + + /* Now pretend we exchanged the mappings. */ + if (xmi_can_merge(&adj.left2, &irec1)) + adj.left2.br_blockcount += irec1.br_blockcount; + else + memcpy(&adj.left2, &irec1, sizeof(irec1)); + + if (xmi_can_merge(&adj.left1, &irec2)) + adj.left1.br_blockcount += irec2.br_blockcount; + else + memcpy(&adj.left1, &irec2, sizeof(irec2)); + + xmi_advance(xmi, &irec1); + } + + /* Account for the blocks that are being exchanged. */ + if (XFS_IS_REALTIME_INODE(req->ip1) && + xfs_exchmaps_reqfork(req) == XFS_DATA_FORK) { + req->ip1_rtbcount = ip1_blocks; + req->ip2_rtbcount = ip2_blocks; + } else { + req->ip1_bcount = ip1_blocks; + req->ip2_bcount = ip2_blocks; + } + + /* + * Make sure that both forks have enough slack left in their extent + * counters that the exchange operation will not overflow. + */ + trace_xfs_exchmaps_delta_nextents(req, d_nexts1, d_nexts2); + if (req->ip1 == req->ip2) { + error = xmi_ensure_delta_nextents(req, req->ip1, + d_nexts1 + d_nexts2); + } else { + error = xmi_ensure_delta_nextents(req, req->ip1, d_nexts1); + if (error) + goto out_free; + error = xmi_ensure_delta_nextents(req, req->ip2, d_nexts2); + } + if (error) + goto out_free; + + trace_xfs_exchmaps_initial_estimate(req); + error = xfs_exchmaps_estimate_overhead(req); +out_free: + kmem_cache_free(xfs_exchmaps_intent_cache, xmi); + return error; +} + +/* Set the reflink flag before an operation. 
*/ +static inline void +xfs_exchmaps_set_reflink( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + trace_xfs_reflink_set_inode_flag(ip); + + ip->i_diflags2 |= XFS_DIFLAG2_REFLINK; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* + * If either file has shared blocks and we're exchanging data forks, we must + * flag the other file as having shared blocks so that we get the shared-block + * rmap functions if we need to fix up the rmaps. + */ +void +xfs_exchmaps_ensure_reflink( + struct xfs_trans *tp, + const struct xfs_exchmaps_intent *xmi) +{ + unsigned int rs = 0; + + if (xfs_is_reflink_inode(xmi->xmi_ip1)) + rs |= 1; + if (xfs_is_reflink_inode(xmi->xmi_ip2)) + rs |= 2; + + if ((rs & 1) && !xfs_is_reflink_inode(xmi->xmi_ip2)) + xfs_exchmaps_set_reflink(tp, xmi->xmi_ip2); + + if ((rs & 2) && !xfs_is_reflink_inode(xmi->xmi_ip1)) + xfs_exchmaps_set_reflink(tp, xmi->xmi_ip1); +} + +/* Set the large extent count flag before an operation if needed. */ +static inline void +xfs_exchmaps_ensure_large_extent_counts( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + if (xfs_inode_has_large_extent_counts(ip)) + return; + + ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* Widen the extent counter fields of both inodes if necessary. */ +void +xfs_exchmaps_upgrade_extent_counts( + struct xfs_trans *tp, + const struct xfs_exchmaps_intent *xmi) +{ + if (!xfs_has_large_extent_counts(tp->t_mountp)) + return; + + xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip1); + xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip2); +} + +/* + * Schedule an exchange of a range of mappings from one inode to another. + * + * The use of file mapping exchange log intent items ensures the operation can + * be resumed even if the system goes down. The caller must commit the + * transaction to start the work.
+ * + * The caller must ensure that the inodes are joined to the transaction and + * ILOCKd; they will still be joined to the transaction at exit. + */ +void +xfs_exchange_mappings( + struct xfs_trans *tp, + const struct xfs_exchmaps_req *req) +{ + struct xfs_exchmaps_intent *xmi; + + BUILD_BUG_ON(XFS_EXCHMAPS_INTERNAL_FLAGS & XFS_EXCHMAPS_LOGGED_FLAGS); + + xfs_assert_ilocked(req->ip1, XFS_ILOCK_EXCL); + xfs_assert_ilocked(req->ip2, XFS_ILOCK_EXCL); + ASSERT(!(req->flags & ~XFS_EXCHMAPS_LOGGED_FLAGS)); + if (req->flags & XFS_EXCHMAPS_SET_SIZES) + ASSERT(!(req->flags & XFS_EXCHMAPS_ATTR_FORK)); + ASSERT(xfs_has_exchange_range(tp->t_mountp)); + + if (req->blockcount == 0) + return; + + xmi = xfs_exchmaps_init_intent(req); + xfs_exchmaps_defer_add(tp, xmi); + xfs_exchmaps_ensure_reflink(tp, xmi); + xfs_exchmaps_upgrade_extent_counts(tp, xmi); +} diff --git a/fs/xfs/libxfs/xfs_exchmaps.h b/fs/xfs/libxfs/xfs_exchmaps.h new file mode 100644 index 000000000000..fa822dff202a --- /dev/null +++ b/fs/xfs/libxfs/xfs_exchmaps.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_EXCHMAPS_H__ +#define __XFS_EXCHMAPS_H__ + +/* In-core deferred operation info about a file mapping exchange request. */ +struct xfs_exchmaps_intent { + /* List of other incore deferred work. */ + struct list_head xmi_list; + + /* Inodes participating in the operation. */ + struct xfs_inode *xmi_ip1; + struct xfs_inode *xmi_ip2; + + /* File offset range information. */ + xfs_fileoff_t xmi_startoff1; + xfs_fileoff_t xmi_startoff2; + xfs_filblks_t xmi_blockcount; + + /* Set these file sizes after the operation, unless negative. */ + xfs_fsize_t xmi_isize1; + xfs_fsize_t xmi_isize2; + + uint64_t xmi_flags; /* XFS_EXCHMAPS_* flags */ +}; + +/* Try to convert inode2 from block to short format at the end, if possible.
*/ +#define __XFS_EXCHMAPS_INO2_SHORTFORM (1ULL << 63) + +#define XFS_EXCHMAPS_INTERNAL_FLAGS (__XFS_EXCHMAPS_INO2_SHORTFORM) + +/* flags that can be passed to xfs_exchmaps_{estimate,mappings} */ +#define XFS_EXCHMAPS_PARAMS (XFS_EXCHMAPS_ATTR_FORK | \ + XFS_EXCHMAPS_SET_SIZES | \ + XFS_EXCHMAPS_INO1_WRITTEN) + +static inline int +xfs_exchmaps_whichfork(const struct xfs_exchmaps_intent *xmi) +{ + if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK) + return XFS_ATTR_FORK; + return XFS_DATA_FORK; +} + +/* Parameters for a mapping exchange request. */ +struct xfs_exchmaps_req { + /* Inodes participating in the operation. */ + struct xfs_inode *ip1; + struct xfs_inode *ip2; + + /* File offset range information. */ + xfs_fileoff_t startoff1; + xfs_fileoff_t startoff2; + xfs_filblks_t blockcount; + + /* XFS_EXCHMAPS_* operation flags */ + uint64_t flags; + + /* + * Fields below this line are filled out by xfs_exchmaps_estimate; + * callers should initialize this part of the struct to zero. + */ + + /* + * Data device blocks to be moved out of ip1, and free space needed to + * handle the bmbt changes. + */ + xfs_filblks_t ip1_bcount; + + /* + * Data device blocks to be moved out of ip2, and free space needed to + * handle the bmbt changes. + */ + xfs_filblks_t ip2_bcount; + + /* rt blocks to be moved out of ip1. */ + xfs_filblks_t ip1_rtbcount; + + /* rt blocks to be moved out of ip2. 
*/ + xfs_filblks_t ip2_rtbcount; + + /* Free space needed to handle the bmbt changes */ + unsigned long long resblks; + + /* Number of exchanges needed to complete the operation */ + unsigned long long nr_exchanges; +}; + +static inline int +xfs_exchmaps_reqfork(const struct xfs_exchmaps_req *req) +{ + if (req->flags & XFS_EXCHMAPS_ATTR_FORK) + return XFS_ATTR_FORK; + return XFS_DATA_FORK; +} + +int xfs_exchmaps_estimate_overhead(struct xfs_exchmaps_req *req); +int xfs_exchmaps_estimate(struct xfs_exchmaps_req *req); + +extern struct kmem_cache *xfs_exchmaps_intent_cache; + +int __init xfs_exchmaps_intent_init_cache(void); +void xfs_exchmaps_intent_destroy_cache(void); + +struct xfs_exchmaps_intent *xfs_exchmaps_init_intent( + const struct xfs_exchmaps_req *req); +void xfs_exchmaps_ensure_reflink(struct xfs_trans *tp, + const struct xfs_exchmaps_intent *xmi); +void xfs_exchmaps_upgrade_extent_counts(struct xfs_trans *tp, + const struct xfs_exchmaps_intent *xmi); + +int xfs_exchmaps_finish_one(struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi); + +int xfs_exchmaps_check_forks(struct xfs_mount *mp, + const struct xfs_exchmaps_req *req); + +void xfs_exchange_mappings(struct xfs_trans *tp, + const struct xfs_exchmaps_req *req); + +#endif /* __XFS_EXCHMAPS_H__ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 2b2f9050fbfb..61f51becff4f 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -367,19 +367,23 @@ xfs_sb_has_ro_compat_feature( return (sbp->sb_features_ro_compat & feature) != 0; } -#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ -#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ -#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ -#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ -#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */ -#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */ 
+#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ +#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ +#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ +#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ +#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */ +#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */ +#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */ +#define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */ #define XFS_SB_FEAT_INCOMPAT_ALL \ - (XFS_SB_FEAT_INCOMPAT_FTYPE| \ - XFS_SB_FEAT_INCOMPAT_SPINODES| \ - XFS_SB_FEAT_INCOMPAT_META_UUID| \ - XFS_SB_FEAT_INCOMPAT_BIGTIME| \ - XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \ - XFS_SB_FEAT_INCOMPAT_NREXT64) + (XFS_SB_FEAT_INCOMPAT_FTYPE | \ + XFS_SB_FEAT_INCOMPAT_SPINODES | \ + XFS_SB_FEAT_INCOMPAT_META_UUID | \ + XFS_SB_FEAT_INCOMPAT_BIGTIME | \ + XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR | \ + XFS_SB_FEAT_INCOMPAT_NREXT64 | \ + XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \ + XFS_SB_FEAT_INCOMPAT_PARENT) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -898,6 +902,12 @@ static inline uint xfs_dinode_size(int version) #define XFS_MAXLINK ((1U << 31) - 1U) /* + * Any file that hits the maximum ondisk link count should be pinned to avoid + * a use-after-free situation. 
+ */ +#define XFS_NLINK_PINNED (~0U) + +/* * Values for di_format * * This enum is used in string mapping in xfs_trace.h; please keep the diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index ca1b17d01437..97996cb79aaa 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -239,6 +239,8 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */ #define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */ #define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */ +#define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */ +#define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */ /* * Minimum and maximum sizes need for growth checks. @@ -409,6 +411,7 @@ struct xfs_bulkstat { #define XFS_BS_SICK_XATTR (1 << 5) /* extended attributes */ #define XFS_BS_SICK_SYMLINK (1 << 6) /* symbolic link remote target */ #define XFS_BS_SICK_PARENT (1 << 7) /* parent pointers */ +#define XFS_BS_SICK_DIRTREE (1 << 8) /* directory tree structure */ /* * Project quota id helpers (previously projid was 16bit only @@ -632,7 +635,9 @@ typedef struct xfs_fsop_attrmulti_handlereq { /* * per machine unique filesystem identifier types. */ -typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */ +typedef struct xfs_fsid { + __u32 val[2]; /* file system id type */ +} xfs_fsid_t; typedef struct xfs_fid { __u16 fid_len; /* length of remainder */ @@ -715,9 +720,19 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_QUOTACHECK 25 /* quota counters */ #define XFS_SCRUB_TYPE_NLINKS 26 /* inode link counts */ #define XFS_SCRUB_TYPE_HEALTHY 27 /* everything checked out ok */ +#define XFS_SCRUB_TYPE_DIRTREE 28 /* directory tree structure */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 28 +#define XFS_SCRUB_TYPE_NR 29 + +/* + * This special type code only applies to the vectored scrub implementation. 
+ * + * If any of the previous scrub vectors recorded runtime errors or have + * sv_flags bits set that match the OFLAG bits in the barrier vector's + * sv_flags, set the barrier's sv_ret to -ECANCELED and return to userspace. + */ +#define XFS_SCRUB_TYPE_BARRIER (0xFFFFFFFF) /* i: Repair this metadata. */ #define XFS_SCRUB_IFLAG_REPAIR (1u << 0) @@ -763,6 +778,29 @@ struct xfs_scrub_metadata { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED) #define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT) +/* Vectored scrub calls to reduce the number of kernel transitions. */ + +struct xfs_scrub_vec { + __u32 sv_type; /* XFS_SCRUB_TYPE_* */ + __u32 sv_flags; /* XFS_SCRUB_FLAGS_* */ + __s32 sv_ret; /* 0 or a negative error code */ + __u32 sv_reserved; /* must be zero */ +}; + +/* Vectored metadata scrub control structure. */ +struct xfs_scrub_vec_head { + __u64 svh_ino; /* inode number. */ + __u32 svh_gen; /* inode generation. */ + __u32 svh_agno; /* ag number. */ + __u32 svh_flags; /* XFS_SCRUB_VEC_FLAGS_* */ + __u16 svh_rest_us; /* wait this much time between vector items */ + __u16 svh_nr; /* number of svh_vectors */ + __u64 svh_reserved; /* must be zero */ + __u64 svh_vectors; /* pointer to buffer of xfs_scrub_vec */ +}; + +#define XFS_SCRUB_VEC_FLAGS_ALL (0) + /* * ioctl limits */ @@ -772,6 +810,118 @@ struct xfs_scrub_metadata { # define XFS_XATTR_LIST_MAX 65536 #endif +/* + * Exchange part of file1 with part of the file that this ioctl is being + * called against (which we'll call file2). Filesystems must be able to + * restart and complete the operation even after the system goes down. 
+ */ +struct xfs_exchange_range { + __s32 file1_fd; + __u32 pad; /* must be zeroes */ + __u64 file1_offset; /* file1 offset, bytes */ + __u64 file2_offset; /* file2 offset, bytes */ + __u64 length; /* bytes to exchange */ + + __u64 flags; /* see XFS_EXCHANGE_RANGE_* below */ +}; + +/* + * Exchange file data all the way to the ends of both files, and then exchange + * the file sizes. This flag can be used to replace a file's contents with a + * different amount of data. length will be ignored. + */ +#define XFS_EXCHANGE_RANGE_TO_EOF (1ULL << 0) + +/* Flush all changes in file data and file metadata to disk before returning. */ +#define XFS_EXCHANGE_RANGE_DSYNC (1ULL << 1) + +/* Dry run; do all the parameter verification but do not change anything. */ +#define XFS_EXCHANGE_RANGE_DRY_RUN (1ULL << 2) + +/* + * Exchange only the parts of the two files where the file allocation units + * mapped to file1's range have been written to. This can accelerate + * scatter-gather atomic writes with a temp file if all writes are aligned to + * the file allocation unit. + */ +#define XFS_EXCHANGE_RANGE_FILE1_WRITTEN (1ULL << 3) + +#define XFS_EXCHANGE_RANGE_ALL_FLAGS (XFS_EXCHANGE_RANGE_TO_EOF | \ + XFS_EXCHANGE_RANGE_DSYNC | \ + XFS_EXCHANGE_RANGE_DRY_RUN | \ + XFS_EXCHANGE_RANGE_FILE1_WRITTEN) + +/* Iterating parent pointers of files. 
*/ + +/* target was the root directory */ +#define XFS_GETPARENTS_OFLAG_ROOT (1U << 0) + +/* Cursor is done iterating pptrs */ +#define XFS_GETPARENTS_OFLAG_DONE (1U << 1) + +#define XFS_GETPARENTS_OFLAGS_ALL (XFS_GETPARENTS_OFLAG_ROOT | \ + XFS_GETPARENTS_OFLAG_DONE) + +#define XFS_GETPARENTS_IFLAGS_ALL (0) + +struct xfs_getparents_rec { + struct xfs_handle gpr_parent; /* Handle to parent */ + __u32 gpr_reclen; /* Length of entire record */ + __u32 gpr_reserved; /* zero */ + char gpr_name[]; /* Null-terminated filename */ +}; + +/* Iterate through this file's directory parent pointers */ +struct xfs_getparents { + /* + * Structure to track progress in iterating the parent pointers. + * Must be initialized to zeroes before the first ioctl call, and + * not touched by callers after that. + */ + struct xfs_attrlist_cursor gp_cursor; + + /* Input flags: XFS_GETPARENTS_IFLAG* */ + __u16 gp_iflags; + + /* Output flags: XFS_GETPARENTS_OFLAG* */ + __u16 gp_oflags; + + /* Size of the gp_buffer in bytes */ + __u32 gp_bufsize; + + /* Must be set to zero */ + __u64 gp_reserved; + + /* Pointer to a buffer in which to place xfs_getparents_rec */ + __u64 gp_buffer; +}; + +static inline struct xfs_getparents_rec * +xfs_getparents_first_rec(struct xfs_getparents *gp) +{ + return (struct xfs_getparents_rec *)(uintptr_t)gp->gp_buffer; +} + +static inline struct xfs_getparents_rec * +xfs_getparents_next_rec(struct xfs_getparents *gp, + struct xfs_getparents_rec *gpr) +{ + void *next = ((void *)gpr + gpr->gpr_reclen); + void *end = (void *)(uintptr_t)(gp->gp_buffer + gp->gp_bufsize); + + if (next >= end) + return NULL; + + return next; +} + +/* Iterate through this file handle's directory parent pointers. */ +struct xfs_getparents_by_handle { + /* Handle to file whose parents we want. 
*/ + struct xfs_handle gph_handle; + + struct xfs_getparents gph_request; +}; /* * ioctl commands that are used by Linux filesystems @@ -808,6 +958,9 @@ struct xfs_scrub_metadata { /* XFS_IOC_GETFSMAP ------ hoisted 59 */ #define XFS_IOC_SCRUB_METADATA _IOWR('X', 60, struct xfs_scrub_metadata) #define XFS_IOC_AG_GEOMETRY _IOWR('X', 61, struct xfs_ag_geometry) +#define XFS_IOC_GETPARENTS _IOWR('X', 62, struct xfs_getparents) +#define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle) +#define XFS_IOC_SCRUBV_METADATA _IOWR('X', 64, struct xfs_scrub_vec_head) /* * ioctl commands that replace IRIX syssgi()'s @@ -843,6 +996,7 @@ struct xfs_scrub_metadata { #define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom) #define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req) #define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req) +#define XFS_IOC_EXCHANGE_RANGE _IOWR('X', 129, struct xfs_exchange_range) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 3c64b5f9bd68..b0edb4288e59 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -95,6 +95,7 @@ struct xfs_da_args; /* Don't propagate sick status to ag health summary during inactivation */ #define XFS_SICK_INO_FORGET (1 << 12) +#define XFS_SICK_INO_DIRTREE (1 << 13) /* directory tree structure */ /* Primary evidence of health problems in a given group. 
*/ #define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \ @@ -125,7 +126,8 @@ struct xfs_da_args; XFS_SICK_INO_DIR | \ XFS_SICK_INO_XATTR | \ XFS_SICK_INO_SYMLINK | \ - XFS_SICK_INO_PARENT) + XFS_SICK_INO_PARENT | \ + XFS_SICK_INO_DIRTREE) #define XFS_SICK_INO_ZAPPED (XFS_SICK_INO_BMBTD_ZAPPED | \ XFS_SICK_INO_BMBTA_ZAPPED | \ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index e5ac3e5430c4..14c81f227c5b 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1058,6 +1058,33 @@ xfs_inobt_first_free_inode( } /* + * If this AG has corrupt inodes, check if allocating this inode would fail + * with corruption errors. Returns 0 if we're clear, or EAGAIN to try again + * somewhere else. + */ +static int +xfs_dialloc_check_ino( + struct xfs_perag *pag, + struct xfs_trans *tp, + xfs_ino_t ino) +{ + struct xfs_imap imap; + struct xfs_buf *bp; + int error; + + error = xfs_imap(pag, tp, ino, &imap, 0); + if (error) + return -EAGAIN; + + error = xfs_imap_to_bp(pag->pag_mount, tp, &imap, &bp); + if (error) + return -EAGAIN; + + xfs_trans_brelse(tp, bp); + return 0; +} + +/* * Allocate an inode using the inobt-only algorithm. */ STATIC int @@ -1309,6 +1336,13 @@ alloc_inode: ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); + + if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) { + error = xfs_dialloc_check_ino(pag, tp, ino); + if (error) + goto error0; + } + rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; error = xfs_inobt_update(cur, &rec); @@ -1584,6 +1618,12 @@ xfs_dialloc_ag( XFS_INODES_PER_CHUNK) == 0); ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); + if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) { + error = xfs_dialloc_check_ino(pag, tp, ino); + if (error) + goto error_cur; + } + /* * Modify or remove the finobt record. 
*/ @@ -1699,7 +1739,7 @@ xfs_dialloc_good_ag( return false; if (!xfs_perag_initialised_agi(pag)) { - error = xfs_ialloc_read_agi(pag, tp, NULL); + error = xfs_ialloc_read_agi(pag, tp, 0, NULL); if (error) return false; } @@ -1768,7 +1808,7 @@ xfs_dialloc_try_ag( * Then read in the AGI buffer and recheck with the AGI buffer * lock held. */ - error = xfs_ialloc_read_agi(pag, *tpp, &agbp); + error = xfs_ialloc_read_agi(pag, *tpp, 0, &agbp); if (error) return error; @@ -2286,7 +2326,7 @@ xfs_difree( /* * Get the allocation group header. */ - error = xfs_ialloc_read_agi(pag, tp, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) { xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", __func__, error); @@ -2332,7 +2372,7 @@ xfs_imap_lookup( int error; int i; - error = xfs_ialloc_read_agi(pag, tp, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) { xfs_alert(mp, "%s: xfs_ialloc_read_agi() returned error %d, agno %d", @@ -2675,6 +2715,7 @@ int xfs_read_agi( struct xfs_perag *pag, struct xfs_trans *tp, + xfs_buf_flags_t flags, struct xfs_buf **agibpp) { struct xfs_mount *mp = pag->pag_mount; @@ -2684,7 +2725,7 @@ xfs_read_agi( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops); + XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops); if (xfs_metadata_is_sick(error)) xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); if (error) @@ -2704,6 +2745,7 @@ int xfs_ialloc_read_agi( struct xfs_perag *pag, struct xfs_trans *tp, + int flags, struct xfs_buf **agibpp) { struct xfs_buf *agibp; @@ -2712,7 +2754,9 @@ xfs_ialloc_read_agi( trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno); - error = xfs_read_agi(pag, tp, &agibp); + error = xfs_read_agi(pag, tp, + (flags & XFS_IALLOC_FLAG_TRYLOCK) ? 
XBF_TRYLOCK : 0, + &agibp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index f1412183bb44..b549627e3a61 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -63,10 +63,11 @@ xfs_ialloc_log_agi( struct xfs_buf *bp, /* allocation group header buffer */ uint32_t fields); /* bitmask of fields to log */ -int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, +int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, xfs_buf_flags_t flags, struct xfs_buf **agibpp); int xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, - struct xfs_buf **agibpp); + int flags, struct xfs_buf **agibpp); +#define XFS_IALLOC_FLAG_TRYLOCK (1U << 0) /* use trylock for buffer locking */ /* * Lookup a record by ino in the btree given by cur. diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index cc661fca6ff5..42e9fd47f6c7 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -745,7 +745,7 @@ xfs_finobt_count_blocks( struct xfs_btree_cur *cur; int error; - error = xfs_ialloc_read_agi(pag, tp, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) return error; @@ -768,7 +768,7 @@ xfs_finobt_read_blocks( struct xfs_agi *agi; int error; - error = xfs_ialloc_read_agi(pag, tp, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index d0dcce462bf4..d79002343d0b 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -491,6 +491,14 @@ xfs_dinode_verify( return __this_address; } + if (dip->di_version > 1) { + if (dip->di_onlink) + return __this_address; + } else { + if (dip->di_nlink) + return __this_address; + } + /* don't allow invalid i_size */ di_size = be64_to_cpu(dip->di_size); if (di_size & (1ULL << 63)) diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 
7d660a973909..9d11ae015909 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -765,53 +765,46 @@ xfs_ifork_verify_local_attr( return 0; } +/* + * Check if the inode fork supports adding nr_to_add more extents. + * + * If it doesn't but we can upgrade it to large extent counters, do the upgrade. + * If we can't upgrade or are already using big counters but still can't fit the + * additional extents, return -EFBIG. + */ int -xfs_iext_count_may_overflow( +xfs_iext_count_extend( + struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, - int nr_to_add) + uint nr_to_add) { + struct xfs_mount *mp = ip->i_mount; + bool has_large = + xfs_inode_has_large_extent_counts(ip); struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); - uint64_t max_exts; uint64_t nr_exts; + ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR); + if (whichfork == XFS_COW_FORK) return 0; - max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip), - whichfork); - - if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) - max_exts = 10; - + /* no point in upgrading if if_nextents overflows */ nr_exts = ifp->if_nextents + nr_to_add; - if (nr_exts < ifp->if_nextents || nr_exts > max_exts) + if (nr_exts < ifp->if_nextents) return -EFBIG; - return 0; -} - -/* - * Upgrade this inode's extent counter fields to be able to handle a potential - * increase in the extent count by nr_to_add. Normally this is the same - * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG. 
- */ -int -xfs_iext_count_upgrade( - struct xfs_trans *tp, - struct xfs_inode *ip, - uint nr_to_add) -{ - ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR); - - if (!xfs_has_large_extent_counts(ip->i_mount) || - xfs_inode_has_large_extent_counts(ip) || - XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && + nr_exts > 10) return -EFBIG; - ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - + if (nr_exts > xfs_iext_max_nextents(has_large, whichfork)) { + if (has_large || !xfs_has_large_extent_counts(mp)) + return -EFBIG; + ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } return 0; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index bd53eb951b65..2373d12fd474 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -256,10 +256,8 @@ extern void xfs_ifork_init_cow(struct xfs_inode *ip); int xfs_ifork_verify_local_data(struct xfs_inode *ip); int xfs_ifork_verify_local_attr(struct xfs_inode *ip); -int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork, - int nr_to_add); -int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip, - uint nr_to_add); +int xfs_iext_count_extend(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, uint nr_to_add); bool xfs_ifork_is_realtime(struct xfs_inode *ip, int whichfork); /* returns true if the fork has extents but they are not read in yet. 
*/ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 16872972e1e9..3e6682ed656b 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -115,10 +115,13 @@ struct xfs_unmount_log_format { #define XLOG_REG_TYPE_BUD_FORMAT 26 #define XLOG_REG_TYPE_ATTRI_FORMAT 27 #define XLOG_REG_TYPE_ATTRD_FORMAT 28 -#define XLOG_REG_TYPE_ATTR_NAME 29 +#define XLOG_REG_TYPE_ATTR_NAME 29 #define XLOG_REG_TYPE_ATTR_VALUE 30 -#define XLOG_REG_TYPE_MAX 30 - +#define XLOG_REG_TYPE_XMI_FORMAT 31 +#define XLOG_REG_TYPE_XMD_FORMAT 32 +#define XLOG_REG_TYPE_ATTR_NEWNAME 33 +#define XLOG_REG_TYPE_ATTR_NEWVALUE 34 +#define XLOG_REG_TYPE_MAX 34 /* * Flags to log operation header @@ -243,6 +246,8 @@ typedef struct xfs_trans_header { #define XFS_LI_BUD 0x1245 #define XFS_LI_ATTRI 0x1246 /* attr set/remove intent*/ #define XFS_LI_ATTRD 0x1247 /* attr set/remove done */ +#define XFS_LI_XMI 0x1248 /* mapping exchange intent */ +#define XFS_LI_XMD 0x1249 /* mapping exchange done */ #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -260,7 +265,9 @@ typedef struct xfs_trans_header { { XFS_LI_BUI, "XFS_LI_BUI" }, \ { XFS_LI_BUD, "XFS_LI_BUD" }, \ { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \ - { XFS_LI_ATTRD, "XFS_LI_ATTRD" } + { XFS_LI_ATTRD, "XFS_LI_ATTRD" }, \ + { XFS_LI_XMI, "XFS_LI_XMI" }, \ + { XFS_LI_XMD, "XFS_LI_XMD" } /* * Inode Log Item Format definitions. @@ -879,6 +886,61 @@ struct xfs_bud_log_format { }; /* + * XMI/XMD (file mapping exchange) log format definitions + */ + +/* This is the structure used to lay out a mapping exchange log item. 
*/ +struct xfs_xmi_log_format { + uint16_t xmi_type; /* xmi log item type */ + uint16_t xmi_size; /* size of this item */ + uint32_t __pad; /* must be zero */ + uint64_t xmi_id; /* xmi identifier */ + + uint64_t xmi_inode1; /* inumber of first file */ + uint64_t xmi_inode2; /* inumber of second file */ + uint32_t xmi_igen1; /* generation of first file */ + uint32_t xmi_igen2; /* generation of second file */ + uint64_t xmi_startoff1; /* block offset into file1 */ + uint64_t xmi_startoff2; /* block offset into file2 */ + uint64_t xmi_blockcount; /* number of blocks */ + uint64_t xmi_flags; /* XFS_EXCHMAPS_* */ + uint64_t xmi_isize1; /* intended file1 size */ + uint64_t xmi_isize2; /* intended file2 size */ +}; + +/* Exchange mappings between extended attribute forks instead of data forks. */ +#define XFS_EXCHMAPS_ATTR_FORK (1ULL << 0) + +/* Set the file sizes when finished. */ +#define XFS_EXCHMAPS_SET_SIZES (1ULL << 1) + +/* + * Exchange the mappings of the two files only if the file allocation units + * mapped to file1's range have been written. + */ +#define XFS_EXCHMAPS_INO1_WRITTEN (1ULL << 2) + +/* Clear the reflink flag from inode1 after the operation. */ +#define XFS_EXCHMAPS_CLEAR_INO1_REFLINK (1ULL << 3) + +/* Clear the reflink flag from inode2 after the operation. */ +#define XFS_EXCHMAPS_CLEAR_INO2_REFLINK (1ULL << 4) + +#define XFS_EXCHMAPS_LOGGED_FLAGS (XFS_EXCHMAPS_ATTR_FORK | \ + XFS_EXCHMAPS_SET_SIZES | \ + XFS_EXCHMAPS_INO1_WRITTEN | \ + XFS_EXCHMAPS_CLEAR_INO1_REFLINK | \ + XFS_EXCHMAPS_CLEAR_INO2_REFLINK) + +/* This is the structure used to lay out a mapping exchange done log item. */ +struct xfs_xmd_log_format { + uint16_t xmd_type; /* xmd log item type */ + uint16_t xmd_size; /* size of this item */ + uint32_t __pad; + uint64_t xmd_xmi_id; /* id of corresponding xmi */ +}; + +/* * Dquot Log format definitions. 
* * The first two fields must be the type and size fitting into @@ -966,6 +1028,9 @@ struct xfs_icreate_log { #define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */ #define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */ #define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */ +#define XFS_ATTRI_OP_FLAGS_PPTR_SET 4 /* Set parent pointer */ +#define XFS_ATTRI_OP_FLAGS_PPTR_REMOVE 5 /* Remove parent pointer */ +#define XFS_ATTRI_OP_FLAGS_PPTR_REPLACE 6 /* Replace parent pointer */ #define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */ /* @@ -974,6 +1039,7 @@ struct xfs_icreate_log { */ #define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \ XFS_ATTR_SECURE | \ + XFS_ATTR_PARENT | \ XFS_ATTR_INCOMPLETE) /* @@ -983,11 +1049,22 @@ struct xfs_icreate_log { struct xfs_attri_log_format { uint16_t alfi_type; /* attri log item type */ uint16_t alfi_size; /* size of this item */ - uint32_t __pad; /* pad to 64 bit aligned */ + uint32_t alfi_igen; /* generation of alfi_ino for pptr ops */ uint64_t alfi_id; /* attri identifier */ uint64_t alfi_ino; /* the inode for this attr operation */ uint32_t alfi_op_flags; /* marks the op as a set or remove */ - uint32_t alfi_name_len; /* attr name length */ + union { + uint32_t alfi_name_len; /* attr name length */ + struct { + /* + * For PPTR_REPLACE, these are the lengths of the old + * and new attr names. The new and old values must + * have the same length. 
+ */ + uint16_t alfi_old_name_len; + uint16_t alfi_new_name_len; + }; + }; uint32_t alfi_value_len; /* attr value length */ uint32_t alfi_attr_filter;/* attr filter flags */ }; diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 9fe7a9564bca..521d327e4c89 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -75,6 +75,8 @@ extern const struct xlog_recover_item_ops xlog_cui_item_ops; extern const struct xlog_recover_item_ops xlog_cud_item_ops; extern const struct xlog_recover_item_ops xlog_attri_item_ops; extern const struct xlog_recover_item_ops xlog_attrd_item_ops; +extern const struct xlog_recover_item_ops xlog_xmi_item_ops; +extern const struct xlog_recover_item_ops xlog_xmd_item_ops; /* * Macros, structures, prototypes for internal log manager use. @@ -121,6 +123,8 @@ bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len); int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_inode **ipp); +int xlog_recover_iget_handle(struct xfs_mount *mp, xfs_ino_t ino, uint32_t gen, + struct xfs_inode **ipp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); int xlog_alloc_buf_cancel_table(struct xlog *log); diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index 9975b93a7412..d3bd6a86c8fe 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -17,6 +17,34 @@ #include "xfs_trace.h" /* + * Shortly after enabling the large extents count feature in 2023, longstanding + * bugs were found in the code that computes the minimum log size. Luckily, + * the bugs resulted in over-estimates of that size, so there's no impact to + * existing users. However, we don't want to reduce the minimum log size + * because that can create the situation where a newer mkfs writes a new + * filesystem that an older kernel won't mount. 
+ * + * Several years prior, we also discovered that the transaction reservations + * for rmap and reflink operations were unnecessarily large. That was fixed, + * but the minimum log size computation was left alone to avoid the + * compatibility problems noted above. Fix that too. + * + * Therefore, we only may correct the computation starting with filesystem + * features that didn't exist in 2023. In other words, only turn this on if + * the filesystem has parent pointers. + * + * This function can be called before the XFS_HAS_* flags have been set up, + * (e.g. mkfs) so we must check the ondisk superblock. + */ +static inline bool +xfs_want_minlogsize_fixes( + struct xfs_sb *sb) +{ + return xfs_sb_is_v5(sb) && + xfs_sb_has_incompat_feature(sb, XFS_SB_FEAT_INCOMPAT_PARENT); +} + +/* * Calculate the maximum length in bytes that would be required for a local * attribute value as large attributes out of line are not logged. */ @@ -31,6 +59,15 @@ xfs_log_calc_max_attrsetm_res( MAXNAMELEN - 1; nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); nblks += XFS_B_TO_FSB(mp, size); + + /* + * If the feature set is new enough, correct a unit conversion error in + * the xattr transaction reservation code that resulted in oversized + * minimum log size computations. + */ + if (xfs_want_minlogsize_fixes(&mp->m_sb)) + size = XFS_B_TO_FSB(mp, size); + nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK); return M_RES(mp)->tr_attrsetm.tr_logres + @@ -49,6 +86,15 @@ xfs_log_calc_trans_resv_for_minlogblocks( unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; /* + * If the feature set is new enough, drop the oversized minimum log + * size computation introduced by the original reflink code. + */ + if (xfs_want_minlogsize_fixes(&mp->m_sb)) { + xfs_trans_resv_calc(mp, resv); + return; + } + + /* * In the early days of rmap+reflink, we always set the rmap maxlevels * to 9 even if the AG was small enough that it would never grow to * that height. 
Transaction reservation sizes influence the minimum diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 81885a6a028e..e8cdd77d03fa 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -119,6 +119,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1); XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3); XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); + XFS_CHECK_STRUCT_SIZE(struct xfs_parent_rec, 12); /* log structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88); @@ -155,6 +156,11 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16); XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16); + /* parent pointer ioctls */ + XFS_CHECK_STRUCT_SIZE(struct xfs_getparents_rec, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_getparents, 40); + XFS_CHECK_STRUCT_SIZE(struct xfs_getparents_by_handle, 64); + /* * The v5 superblock format extended several v4 header structures with * additional data. While new fields are only accessible on v5 diff --git a/fs/xfs/libxfs/xfs_parent.c b/fs/xfs/libxfs/xfs_parent.c new file mode 100644 index 000000000000..69366c44a701 --- /dev/null +++ b/fs/xfs/libxfs/xfs_parent.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022-2024 Oracle. + * All rights reserved. 
+ */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_da_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_attr_sf.h" +#include "xfs_bmap.h" +#include "xfs_defer.h" +#include "xfs_log.h" +#include "xfs_xattr.h" +#include "xfs_parent.h" +#include "xfs_trans_space.h" +#include "xfs_attr_item.h" +#include "xfs_health.h" + +struct kmem_cache *xfs_parent_args_cache; + +/* + * Parent pointer attribute handling. + * + * Because the attribute name is a filename component, it will never be longer + * than 255 bytes and must not contain nulls or slashes. These are roughly the + * same constraints that apply to attribute names. + * + * The attribute value must always be a struct xfs_parent_rec. This means the + * attribute will never be in remote format because 12 bytes is nowhere near + * xfs_attr_leaf_entsize_local_max() (~75% of block size). + * + * Creating a new parent attribute will always create a new attribute - there + * should never, ever be an existing attribute in the tree for a new inode. + * ENOSPC behavior is problematic - creating the inode without the parent + * pointer is effectively a corruption, so we allow parent attribute creation + * to dip into the reserve block pool to avoid unexpected ENOSPC errors from + * occurring. + */ + +/* Return true if parent pointer attr name is valid. */ +bool +xfs_parent_namecheck( + unsigned int attr_flags, + const void *name, + size_t length) +{ + /* + * Parent pointers always use logged operations, so there should never + * be incomplete xattrs. 
+ */ + if (attr_flags & XFS_ATTR_INCOMPLETE) + return false; + + return xfs_dir2_namecheck(name, length); +} + +/* Return true if parent pointer attr value is valid. */ +bool +xfs_parent_valuecheck( + struct xfs_mount *mp, + const void *value, + size_t valuelen) +{ + const struct xfs_parent_rec *rec = value; + + if (!xfs_has_parent(mp)) + return false; + + /* The xattr value must be a parent record. */ + if (valuelen != sizeof(struct xfs_parent_rec)) + return false; + + /* The parent record must be local. */ + if (value == NULL) + return false; + + /* The parent inumber must be valid. */ + if (!xfs_verify_dir_ino(mp, be64_to_cpu(rec->p_ino))) + return false; + + return true; +} + +/* Compute the attribute name hash for a parent pointer. */ +xfs_dahash_t +xfs_parent_hashval( + struct xfs_mount *mp, + const uint8_t *name, + int namelen, + xfs_ino_t parent_ino) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + }; + + /* + * Use the same dirent name hash as would be used on the directory, but + * mix in the parent inode number to avoid collisions on hardlinked + * files with identical names but different parents. + */ + return xfs_dir2_hashname(mp, &xname) ^ + upper_32_bits(parent_ino) ^ lower_32_bits(parent_ino); +} + +/* Compute the attribute name hash from the xattr components. */ +xfs_dahash_t +xfs_parent_hashattr( + struct xfs_mount *mp, + const uint8_t *name, + int namelen, + const void *value, + int valuelen) +{ + const struct xfs_parent_rec *rec = value; + + /* Requires a local attr value in xfs_parent_rec format */ + if (valuelen != sizeof(struct xfs_parent_rec)) { + ASSERT(valuelen == sizeof(struct xfs_parent_rec)); + return 0; + } + + if (!value) { + ASSERT(value != NULL); + return 0; + } + + return xfs_parent_hashval(mp, name, namelen, be64_to_cpu(rec->p_ino)); +} + +/* + * Initialize the parent pointer arguments structure. Caller must have zeroed + * the contents of @args. @tp is only required for updates. 
+ */ +static void +xfs_parent_da_args_init( + struct xfs_da_args *args, + struct xfs_trans *tp, + struct xfs_parent_rec *rec, + struct xfs_inode *child, + xfs_ino_t owner, + const struct xfs_name *parent_name) +{ + args->geo = child->i_mount->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->attr_filter = XFS_ATTR_PARENT; + args->op_flags = XFS_DA_OP_LOGGED | XFS_DA_OP_OKNOENT; + args->trans = tp; + args->dp = child; + args->owner = owner; + args->name = parent_name->name; + args->namelen = parent_name->len; + args->value = rec; + args->valuelen = sizeof(struct xfs_parent_rec); + xfs_attr_sethash(args); +} + +/* Make sure the incore state is ready for a parent pointer query/update. */ +static inline int +xfs_parent_iread_extents( + struct xfs_trans *tp, + struct xfs_inode *child) +{ + /* Parent pointers require that the attr fork must exist. */ + if (XFS_IS_CORRUPT(child->i_mount, !xfs_inode_has_attr_fork(child))) { + xfs_inode_mark_sick(child, XFS_SICK_INO_PARENT); + return -EFSCORRUPTED; + } + + return xfs_iread_extents(tp, child, XFS_ATTR_FORK); +} + +/* Add a parent pointer to reflect a dirent addition. */ +int +xfs_parent_addname( + struct xfs_trans *tp, + struct xfs_parent_args *ppargs, + struct xfs_inode *dp, + const struct xfs_name *parent_name, + struct xfs_inode *child) +{ + int error; + + error = xfs_parent_iread_extents(tp, child); + if (error) + return error; + + xfs_inode_to_parent_rec(&ppargs->rec, dp); + xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child, + child->i_ino, parent_name); + xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_SET); + return 0; +} + +/* Remove a parent pointer to reflect a dirent removal. 
*/ +int +xfs_parent_removename( + struct xfs_trans *tp, + struct xfs_parent_args *ppargs, + struct xfs_inode *dp, + const struct xfs_name *parent_name, + struct xfs_inode *child) +{ + int error; + + error = xfs_parent_iread_extents(tp, child); + if (error) + return error; + + xfs_inode_to_parent_rec(&ppargs->rec, dp); + xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child, + child->i_ino, parent_name); + xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_REMOVE); + return 0; +} + +/* Replace one parent pointer with another to reflect a rename. */ +int +xfs_parent_replacename( + struct xfs_trans *tp, + struct xfs_parent_args *ppargs, + struct xfs_inode *old_dp, + const struct xfs_name *old_name, + struct xfs_inode *new_dp, + const struct xfs_name *new_name, + struct xfs_inode *child) +{ + int error; + + error = xfs_parent_iread_extents(tp, child); + if (error) + return error; + + xfs_inode_to_parent_rec(&ppargs->rec, old_dp); + xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child, + child->i_ino, old_name); + + xfs_inode_to_parent_rec(&ppargs->new_rec, new_dp); + ppargs->args.new_name = new_name->name; + ppargs->args.new_namelen = new_name->len; + ppargs->args.new_value = &ppargs->new_rec; + ppargs->args.new_valuelen = sizeof(struct xfs_parent_rec); + xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_REPLACE); + return 0; +} + +/* + * Extract parent pointer information from any parent pointer xattr into + * @parent_ino/gen. The last two parameters can be NULL pointers. + * + * Returns 0 if the name and value pass the parent pointer checks and the + * requested fields were copied out; or -EFSCORRUPTED for garbage.
+ */ +int +xfs_parent_from_attr( + struct xfs_mount *mp, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + xfs_ino_t *parent_ino, + uint32_t *parent_gen) +{ + const struct xfs_parent_rec *rec = value; + + ASSERT(attr_flags & XFS_ATTR_PARENT); + + if (!xfs_parent_namecheck(attr_flags, name, namelen)) + return -EFSCORRUPTED; + if (!xfs_parent_valuecheck(mp, value, valuelen)) + return -EFSCORRUPTED; + + if (parent_ino) + *parent_ino = be64_to_cpu(rec->p_ino); + if (parent_gen) + *parent_gen = be32_to_cpu(rec->p_gen); + return 0; +} + +/* + * Look up a parent pointer record (@parent_name -> @pptr) of @ip. + * + * Caller must hold at least ILOCK_SHARED. The scratchpad need not be + * initialized. + * + * Returns 0 if the pointer is found, -ENOATTR if there is no match, or a + * negative errno. + */ +int +xfs_parent_lookup( + struct xfs_trans *tp, + struct xfs_inode *ip, + const struct xfs_name *parent_name, + struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch) +{ + memset(scratch, 0, sizeof(struct xfs_da_args)); + xfs_parent_da_args_init(scratch, tp, pptr, ip, ip->i_ino, parent_name); + return xfs_attr_get_ilocked(scratch); +} + +/* Sanity-check a parent pointer before we try to perform repairs. */ +static inline bool +xfs_parent_sanity_check( + struct xfs_mount *mp, + const struct xfs_name *parent_name, + const struct xfs_parent_rec *pptr) +{ + if (!xfs_parent_namecheck(XFS_ATTR_PARENT, parent_name->name, + parent_name->len)) + return false; + + if (!xfs_parent_valuecheck(mp, pptr, sizeof(*pptr))) + return false; + + return true; +} + + +/* + * Attach the parent pointer (@parent_name -> @pptr) to @ip immediately. + * Caller must not have a transaction or hold the ILOCK. This is for + * specialized repair functions only. The scratchpad need not be initialized.
+ */ +int +xfs_parent_set( + struct xfs_inode *ip, + xfs_ino_t owner, + const struct xfs_name *parent_name, + struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch) +{ + if (!xfs_parent_sanity_check(ip->i_mount, parent_name, pptr)) { + ASSERT(0); + return -EFSCORRUPTED; + } + + memset(scratch, 0, sizeof(struct xfs_da_args)); + xfs_parent_da_args_init(scratch, NULL, pptr, ip, owner, parent_name); + return xfs_attr_set(scratch, XFS_ATTRUPDATE_CREATE, false); +} + +/* + * Remove the parent pointer (@parent_name -> @pptr) from @ip immediately. + * Caller must not have a transaction or hold the ILOCK. This is for + * specialized repair functions only. The scratchpad need not be initialized. + */ +int +xfs_parent_unset( + struct xfs_inode *ip, + xfs_ino_t owner, + const struct xfs_name *parent_name, + struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch) +{ + if (!xfs_parent_sanity_check(ip->i_mount, parent_name, pptr)) { + ASSERT(0); + return -EFSCORRUPTED; + } + + memset(scratch, 0, sizeof(struct xfs_da_args)); + xfs_parent_da_args_init(scratch, NULL, pptr, ip, owner, parent_name); + return xfs_attr_set(scratch, XFS_ATTRUPDATE_REMOVE, false); +} diff --git a/fs/xfs/libxfs/xfs_parent.h b/fs/xfs/libxfs/xfs_parent.h new file mode 100644 index 000000000000..b8036527cdc7 --- /dev/null +++ b/fs/xfs/libxfs/xfs_parent.h @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022-2024 Oracle. + * All Rights Reserved. 
+ */ +#ifndef __XFS_PARENT_H__ +#define __XFS_PARENT_H__ + +/* Metadata validators */ +bool xfs_parent_namecheck(unsigned int attr_flags, const void *name, + size_t length); +bool xfs_parent_valuecheck(struct xfs_mount *mp, const void *value, + size_t valuelen); + +xfs_dahash_t xfs_parent_hashval(struct xfs_mount *mp, const uint8_t *name, + int namelen, xfs_ino_t parent_ino); +xfs_dahash_t xfs_parent_hashattr(struct xfs_mount *mp, const uint8_t *name, + int namelen, const void *value, int valuelen); + +/* Initializes a xfs_parent_rec (big-endian @ino and @gen) to be stored as an attribute value. */ +static inline void +xfs_parent_rec_init( + struct xfs_parent_rec *rec, + xfs_ino_t ino, + uint32_t gen) +{ + rec->p_ino = cpu_to_be64(ino); + rec->p_gen = cpu_to_be32(gen); +} + +/* Initializes a xfs_parent_rec from the inode number and generation of @dp, to be stored as an attribute value. */ +static inline void +xfs_inode_to_parent_rec( + struct xfs_parent_rec *rec, + const struct xfs_inode *dp) +{ + xfs_parent_rec_init(rec, dp->i_ino, VFS_IC(dp)->i_generation); +} + +extern struct kmem_cache *xfs_parent_args_cache; + +/* + * Parent pointer information needed to pass around the deferred xattr update + * machinery. + */ +struct xfs_parent_args { + struct xfs_parent_rec rec; + struct xfs_parent_rec new_rec; + struct xfs_da_args args; +}; + +/* + * Start a parent pointer update by allocating the context object we need to + * perform a parent pointer update. + */ +static inline int +xfs_parent_start( + struct xfs_mount *mp, + struct xfs_parent_args **ppargsp) +{ + if (!xfs_has_parent(mp)) { + *ppargsp = NULL; + return 0; + } + + *ppargsp = kmem_cache_zalloc(xfs_parent_args_cache, GFP_KERNEL); + if (!*ppargsp) + return -ENOMEM; + return 0; +} + +/* Finish a parent pointer update by freeing the context object.
*/ +static inline void +xfs_parent_finish( + struct xfs_mount *mp, + struct xfs_parent_args *ppargs) +{ + if (ppargs) + kmem_cache_free(xfs_parent_args_cache, ppargs); +} + +int xfs_parent_addname(struct xfs_trans *tp, struct xfs_parent_args *ppargs, + struct xfs_inode *dp, const struct xfs_name *parent_name, + struct xfs_inode *child); +int xfs_parent_removename(struct xfs_trans *tp, struct xfs_parent_args *ppargs, + struct xfs_inode *dp, const struct xfs_name *parent_name, + struct xfs_inode *child); +int xfs_parent_replacename(struct xfs_trans *tp, + struct xfs_parent_args *ppargs, + struct xfs_inode *old_dp, const struct xfs_name *old_name, + struct xfs_inode *new_dp, const struct xfs_name *new_name, + struct xfs_inode *child); + +int xfs_parent_from_attr(struct xfs_mount *mp, unsigned int attr_flags, + const unsigned char *name, unsigned int namelen, + const void *value, unsigned int valuelen, + xfs_ino_t *parent_ino, uint32_t *parent_gen); + +/* Repair functions */ +int xfs_parent_lookup(struct xfs_trans *tp, struct xfs_inode *ip, + const struct xfs_name *name, struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch); +int xfs_parent_set(struct xfs_inode *ip, xfs_ino_t owner, + const struct xfs_name *name, struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch); +int xfs_parent_unset(struct xfs_inode *ip, xfs_ino_t owner, + const struct xfs_name *name, struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch); + +#endif /* __XFS_PARENT_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index f246d6dbf4ec..386b672c5058 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1168,3 +1168,60 @@ xfs_rtsummary_wordcount( blocks = xfs_rtsummary_blockcount(mp, rsumlevels, rbmblocks); return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG; } + +/* + * Lock both realtime free space metadata inodes for a freespace update. 
If a + * transaction is given, the inodes will be joined to the transaction and the + * ILOCKs will be released on transaction commit. + */ +void +xfs_rtbitmap_lock( + struct xfs_trans *tp, + struct xfs_mount *mp) +{ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); + if (tp) + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + if (tp) + xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); +} + +/* Unlock both realtime free space metadata inodes after a freespace update. */ +void +xfs_rtbitmap_unlock( + struct xfs_mount *mp) +{ + xfs_iunlock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); +} + +/* + * Lock the realtime free space metadata inodes for a freespace scan. Callers + * must walk metadata blocks in order of increasing file offset. + */ +void +xfs_rtbitmap_lock_shared( + struct xfs_mount *mp, + unsigned int rbmlock_flags) +{ + if (rbmlock_flags & XFS_RBMLOCK_BITMAP) + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + + if (rbmlock_flags & XFS_RBMLOCK_SUMMARY) + xfs_ilock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM); +} + +/* Unlock the realtime free space metadata inodes after a freespace scan. 
*/ +void +xfs_rtbitmap_unlock_shared( + struct xfs_mount *mp, + unsigned int rbmlock_flags) +{ + if (rbmlock_flags & XFS_RBMLOCK_SUMMARY) + xfs_iunlock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM); + + if (rbmlock_flags & XFS_RBMLOCK_BITMAP) + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); +} diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index 152a66750af5..6186585f2c37 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -360,6 +360,19 @@ xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp, unsigned int rsumlevels, xfs_extlen_t rbmblocks); unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, unsigned int rsumlevels, xfs_extlen_t rbmblocks); + +void xfs_rtbitmap_lock(struct xfs_trans *tp, struct xfs_mount *mp); +void xfs_rtbitmap_unlock(struct xfs_mount *mp); + +/* Lock the rt bitmap inode in shared mode */ +#define XFS_RBMLOCK_BITMAP (1U << 0) +/* Lock the rt summary inode in shared mode */ +#define XFS_RBMLOCK_SUMMARY (1U << 1) + +void xfs_rtbitmap_lock_shared(struct xfs_mount *mp, + unsigned int rbmlock_flags); +void xfs_rtbitmap_unlock_shared(struct xfs_mount *mp, + unsigned int rbmlock_flags); #else /* CONFIG_XFS_RT */ # define xfs_rtfree_extent(t,b,l) (-ENOSYS) # define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS) @@ -378,6 +391,10 @@ xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) # define xfs_rtbitmap_wordcount(mp, r) (0) # define xfs_rtsummary_blockcount(mp, l, b) (0) # define xfs_rtsummary_wordcount(mp, l, b) (0) +# define xfs_rtbitmap_lock(tp, mp) do { } while (0) +# define xfs_rtbitmap_unlock(mp) do { } while (0) +# define xfs_rtbitmap_lock_shared(mp, lf) do { } while (0) +# define xfs_rtbitmap_unlock_shared(mp, lf) do { } while (0) #endif /* CONFIG_XFS_RT */ #endif /* __XFS_RTBITMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 73a4b895de67..09e4bf949bf8 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ 
b/fs/xfs/libxfs/xfs_sb.c @@ -26,6 +26,7 @@ #include "xfs_health.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" +#include "xfs_exchrange.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -175,6 +176,10 @@ xfs_sb_version_to_features( features |= XFS_FEAT_NEEDSREPAIR; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64) features |= XFS_FEAT_NREXT64; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_EXCHRANGE) + features |= XFS_FEAT_EXCHANGE_RANGE; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_PARENT) + features |= XFS_FEAT_PARENT; return features; } @@ -1251,6 +1256,8 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME; if (xfs_has_inobtcounts(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT; + if (xfs_has_parent(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_PARENT; if (xfs_has_sector(mp)) { geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; geo->logsectsize = sbp->sb_logsectsize; @@ -1259,6 +1266,8 @@ xfs_fs_geometry( } if (xfs_has_large_extent_counts(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64; + if (xfs_has_exchange_range(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index dfd61fa8332e..34f104ed372c 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -124,7 +124,6 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_TRANS_RES_FDBLKS (1u << 6) /* Transaction contains an intent done log item */ #define XFS_TRANS_HAS_INTENT_DONE (1u << 7) - /* * LOWMODE is used by the allocator to activate the lowspace algorithm - when * free space is running low the extent allocator may choose to allocate an @@ -136,7 +135,10 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, * for free space from AG 0. 
If the correct transaction reservations have been * made then this algorithm will eventually find all the space it needs. */ -#define XFS_TRANS_LOWMODE 0x100 /* allocate in low space mode */ +#define XFS_TRANS_LOWMODE (1u << 8) + +/* Transaction has locked the rtbitmap and rtsum inodes */ +#define XFS_TRANS_RTBITMAP_LOCKED (1u << 9) /* * Field values for xfs_trans_mod_sb. diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index ffb1317a9212..f228127a88ff 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -169,7 +169,8 @@ xfs_symlink_local_to_remote( struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, - struct xfs_ifork *ifp) + struct xfs_ifork *ifp, + void *priv) { struct xfs_mount *mp = ip->i_mount; char *buf; @@ -310,6 +311,7 @@ int xfs_symlink_write_target( struct xfs_trans *tp, struct xfs_inode *ip, + xfs_ino_t owner, const char *target_path, int pathlen, xfs_fsblock_t fs_blocks, @@ -364,8 +366,7 @@ xfs_symlink_write_target( byte_cnt = min(byte_cnt, pathlen); buf = bp->b_addr; - buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, byte_cnt, - bp); + buf += xfs_symlink_hdr_set(mp, owner, offset, byte_cnt, bp); memcpy(buf, cur_chunk, byte_cnt); @@ -380,3 +381,50 @@ xfs_symlink_write_target( ASSERT(pathlen == 0); return 0; } + +/* Remove all the blocks from a symlink and invalidate buffers. */ +int +xfs_symlink_remote_truncate( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *bp; + int nmaps = XFS_SYMLINK_MAPS; + int done = 0; + int i; + int error; + + /* Read mappings and invalidate buffers. 
*/ + error = xfs_bmapi_read(ip, 0, XFS_MAX_FILEOFF, mval, &nmaps, 0); + if (error) + return error; + + for (i = 0; i < nmaps; i++) { + if (!xfs_bmap_is_real_extent(&mval[i])) + break; + + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), + XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0, + &bp); + if (error) + return error; + + xfs_trans_binval(tp, bp); + } + + /* Unmap the remote blocks. */ + error = xfs_bunmapi(tp, ip, 0, XFS_MAX_FILEOFF, 0, nmaps, &done); + if (error) + return error; + if (!done) { + ASSERT(done); + xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); + return -EFSCORRUPTED; + } + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_symlink_remote.h b/fs/xfs/libxfs/xfs_symlink_remote.h index a63bd38ae4fa..c1672fe1f17b 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.h +++ b/fs/xfs/libxfs/xfs_symlink_remote.h @@ -16,11 +16,13 @@ int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, uint32_t size, struct xfs_buf *bp); void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_inode *ip, struct xfs_ifork *ifp); + struct xfs_inode *ip, struct xfs_ifork *ifp, + void *priv); xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size); int xfs_symlink_remote_read(struct xfs_inode *ip, char *link); int xfs_symlink_write_target(struct xfs_trans *tp, struct xfs_inode *ip, - const char *target_path, int pathlen, xfs_fsblock_t fs_blocks, - uint resblks); + xfs_ino_t owner, const char *target_path, int pathlen, + xfs_fsblock_t fs_blocks, uint resblks); +int xfs_symlink_remote_truncate(struct xfs_trans *tp, struct xfs_inode *ip); #endif /* __XFS_SYMLINK_REMOTE_H */ diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 6cd45e8c118d..6dbe6e7251e7 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -20,6 +20,9 @@ 
#include "xfs_qm.h" #include "xfs_trans_space.h" #include "xfs_rtbitmap.h" +#include "xfs_attr_item.h" +#include "xfs_log.h" +#include "xfs_da_format.h" #define _ALLOC true #define _FREE false @@ -422,29 +425,110 @@ xfs_calc_itruncate_reservation_minlogsize( return xfs_calc_itruncate_reservation(mp, true); } +static inline unsigned int xfs_calc_pptr_link_overhead(void) +{ + return sizeof(struct xfs_attri_log_format) + + xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) + + xlog_calc_iovec_len(MAXNAMELEN - 1); +} +static inline unsigned int xfs_calc_pptr_unlink_overhead(void) +{ + return sizeof(struct xfs_attri_log_format) + + xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) + + xlog_calc_iovec_len(MAXNAMELEN - 1); +} +static inline unsigned int xfs_calc_pptr_replace_overhead(void) +{ + return sizeof(struct xfs_attri_log_format) + + xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) + + xlog_calc_iovec_len(MAXNAMELEN - 1) + + xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) + + xlog_calc_iovec_len(MAXNAMELEN - 1); +} + /* * In renaming a files we can modify: * the five inodes involved: 5 * inode size * the two directory btrees: 2 * (max depth + v2) * dir block size * the two directory bmap btrees: 2 * max depth * block size * And the bmap_finish transaction can free dir and bmap blocks (two sets - * of bmap blocks) giving: + * of bmap blocks) giving (t2): * the agf for the ags in which the blocks live: 3 * sector size * the agfl for the ags in which the blocks live: 3 * sector size * the superblock for the free block count: sector size * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size + * If parent pointers are enabled (t3), then each transaction in the chain + * must be capable of setting or removing the extended attribute + * containing the parent information. It must also be able to handle + * the three xattr intent items that track the progress of the parent + * pointer update. 
*/ STATIC uint xfs_calc_rename_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 5) + - xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3), - XFS_FSB_TO_B(mp, 1)))); + unsigned int overhead = XFS_DQUOT_LOGRES(mp); + struct xfs_trans_resv *resp = M_RES(mp); + unsigned int t1, t2, t3 = 0; + + t1 = xfs_calc_inode_res(mp, 5) + + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1)); + + t2 = xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3), + XFS_FSB_TO_B(mp, 1)); + + if (xfs_has_parent(mp)) { + unsigned int rename_overhead, exchange_overhead; + + t3 = max(resp->tr_attrsetm.tr_logres, + resp->tr_attrrm.tr_logres); + + /* + * For a standard rename, the three xattr intent log items + * are (1) replacing the pptr for the source file; (2) + * removing the pptr on the dest file; and (3) adding a + * pptr for the whiteout file in the src dir. + * + * For an RENAME_EXCHANGE, there are two xattr intent + * items to replace the pptr for both src and dest + * files. Link counts don't change and there is no + * whiteout. + * + * In the worst case we can end up relogging all log + * intent items to allow the log tail to move ahead, so + * they become overhead added to each transaction in a + * processing chain. 
+ */ + rename_overhead = xfs_calc_pptr_replace_overhead() + + xfs_calc_pptr_unlink_overhead() + + xfs_calc_pptr_link_overhead(); + exchange_overhead = 2 * xfs_calc_pptr_replace_overhead(); + + overhead += max(rename_overhead, exchange_overhead); + } + + return overhead + max3(t1, t2, t3); +} + +static inline unsigned int +xfs_rename_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + /* One for the rename, one more for freeing blocks */ + unsigned int ret = XFS_RENAME_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to remove or add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += max(resp->tr_attrsetm.tr_logcount, + resp->tr_attrrm.tr_logcount); + + return ret; } /* @@ -461,6 +545,23 @@ xfs_calc_iunlink_remove_reservation( 2 * M_IGEO(mp)->inode_cluster_size; } +static inline unsigned int +xfs_link_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_LINK_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. 
+ */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrsetm.tr_logcount; + + return ret; +} + /* * For creating a link to an inode: * the parent directory inode: inode size @@ -477,14 +578,23 @@ STATIC uint xfs_calc_link_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - xfs_calc_iunlink_remove_reservation(mp) + - max((xfs_calc_inode_res(mp, 2) + - xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), - XFS_FSB_TO_B(mp, 1)))); + unsigned int overhead = XFS_DQUOT_LOGRES(mp); + struct xfs_trans_resv *resp = M_RES(mp); + unsigned int t1, t2, t3 = 0; + + overhead += xfs_calc_iunlink_remove_reservation(mp); + t1 = xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), + XFS_FSB_TO_B(mp, 1)); + + if (xfs_has_parent(mp)) { + t3 = resp->tr_attrsetm.tr_logres; + overhead += xfs_calc_pptr_link_overhead(); + } + + return overhead + max3(t1, t2, t3); } /* @@ -499,6 +609,23 @@ xfs_calc_iunlink_add_reservation(xfs_mount_t *mp) M_IGEO(mp)->inode_cluster_size; } +static inline unsigned int +xfs_remove_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_REMOVE_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to remove one parent pointer.
+ */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrrm.tr_logcount; + + return ret; +} + /* * For removing a directory entry we can modify: * the parent directory inode: inode size @@ -515,14 +642,24 @@ STATIC uint xfs_calc_remove_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - xfs_calc_iunlink_add_reservation(mp) + - max((xfs_calc_inode_res(mp, 2) + - xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), - XFS_FSB_TO_B(mp, 1)))); + unsigned int overhead = XFS_DQUOT_LOGRES(mp); + struct xfs_trans_resv *resp = M_RES(mp); + unsigned int t1, t2, t3 = 0; + + overhead += xfs_calc_iunlink_add_reservation(mp); + + t1 = xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), + XFS_FSB_TO_B(mp, 1)); + + if (xfs_has_parent(mp)) { + t3 = resp->tr_attrrm.tr_logres; + overhead += xfs_calc_pptr_unlink_overhead(); + } + + return overhead + max3(t1, t2, t3); } /* @@ -571,12 +708,40 @@ xfs_calc_icreate_resv_alloc( xfs_calc_finobt_res(mp); } +static inline unsigned int +xfs_icreate_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_CREATE_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer.
+ */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrsetm.tr_logcount; + + return ret; +} + STATIC uint -xfs_calc_icreate_reservation(xfs_mount_t *mp) +xfs_calc_icreate_reservation( + struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max(xfs_calc_icreate_resv_alloc(mp), - xfs_calc_create_resv_modify(mp)); + struct xfs_trans_resv *resp = M_RES(mp); + unsigned int overhead = XFS_DQUOT_LOGRES(mp); + unsigned int t1, t2, t3 = 0; + + t1 = xfs_calc_icreate_resv_alloc(mp); + t2 = xfs_calc_create_resv_modify(mp); + + if (xfs_has_parent(mp)) { + t3 = resp->tr_attrsetm.tr_logres; + overhead += xfs_calc_pptr_link_overhead(); + } + + return overhead + max3(t1, t2, t3); } STATIC uint @@ -589,6 +754,23 @@ xfs_calc_create_tmpfile_reservation( return res + xfs_calc_iunlink_add_reservation(mp); } +static inline unsigned int +xfs_mkdir_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_MKDIR_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrsetm.tr_logcount; + + return ret; +} + /* * Making a new directory is the same as creating a new file. */ @@ -599,6 +781,22 @@ xfs_calc_mkdir_reservation( return xfs_calc_icreate_reservation(mp); } +static inline unsigned int +xfs_symlink_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_SYMLINK_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrsetm.tr_logcount; + + return ret; +} /* * Making a new symplink is the same as creating a new file, but @@ -911,54 +1109,76 @@ xfs_calc_sb_reservation( return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); } -void -xfs_trans_resv_calc( +/* + * Namespace reservations. 
+ * + * These get tricky when parent pointers are enabled as we have attribute + * modifications occurring from within these transactions. Rather than confuse + * each of these reservation calculations with the conditional attribute + * reservations, add them here in a clear and concise manner. This requires that + * the attribute reservations have already been calculated. + * + * Note that we only include the static attribute reservation here; the runtime + * reservation will have to be modified by the size of the attributes being + * added/removed/modified. See the comments on the attribute reservation + * calculations for more details. + */ +STATIC void +xfs_calc_namespace_reservations( struct xfs_mount *mp, struct xfs_trans_resv *resp) { - int logcount_adj = 0; - - /* - * The following transactions are logged in physical format and - * require a permanent reservation on space. - */ - resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false); - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; - resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - - resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false); - resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; - resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + ASSERT(resp->tr_attrsetm.tr_logres > 0); resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); - resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT; + resp->tr_rename.tr_logcount = xfs_rename_log_count(mp, resp); resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_link.tr_logres = xfs_calc_link_reservation(mp); - resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT; + resp->tr_link.tr_logcount = xfs_link_log_count(mp, resp); resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp); - resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT; + resp->tr_remove.tr_logcount = xfs_remove_log_count(mp, resp); resp->tr_remove.tr_logflags |= 
XFS_TRANS_PERM_LOG_RES; resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp); - resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT; + resp->tr_symlink.tr_logcount = xfs_symlink_log_count(mp, resp); resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_create.tr_logres = xfs_calc_icreate_reservation(mp); - resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT; + resp->tr_create.tr_logcount = xfs_icreate_log_count(mp, resp); resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp); + resp->tr_mkdir.tr_logcount = xfs_mkdir_log_count(mp, resp); + resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; +} + +void +xfs_trans_resv_calc( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + int logcount_adj = 0; + + /* + * The following transactions are logged in physical format and + * require a permanent reservation on space. + */ + resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false); + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false); + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + resp->tr_create_tmpfile.tr_logres = xfs_calc_create_tmpfile_reservation(mp); resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT; resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp); - resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT; - resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp); resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT; resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES; @@ -988,6 +1208,8 @@ xfs_trans_resv_calc( resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + 
xfs_calc_namespace_reservations(mp, resp); + /* * The following transactions are logged in logical format with * a default log count. diff --git a/fs/xfs/libxfs/xfs_trans_space.c b/fs/xfs/libxfs/xfs_trans_space.c new file mode 100644 index 000000000000..b9dc3752f702 --- /dev/null +++ b/fs/xfs/libxfs/xfs_trans_space.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_da_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" + +/* Calculate the disk space required to add a parent pointer. */ +unsigned int +xfs_parent_calc_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + /* + * Parent pointers are always the first attr in an attr tree, and never + * larger than a block + */ + return XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + + XFS_NEXTENTADD_SPACE_RES(mp, namelen, XFS_ATTR_FORK); +} + +unsigned int +xfs_create_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + unsigned int ret; + + ret = XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp, namelen); + if (xfs_has_parent(mp)) + ret += xfs_parent_calc_space_res(mp, namelen); + + return ret; +} + +unsigned int +xfs_mkdir_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + return xfs_create_space_res(mp, namelen); +} + +unsigned int +xfs_link_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + unsigned int ret; + + ret = XFS_DIRENTER_SPACE_RES(mp, namelen); + if (xfs_has_parent(mp)) + ret += xfs_parent_calc_space_res(mp, namelen); + + return ret; +} + +unsigned int +xfs_symlink_space_res( + struct xfs_mount *mp, + unsigned int namelen, + unsigned int fsblocks) +{ + unsigned int ret; + + ret = XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp, namelen) + + 
fsblocks; + + if (xfs_has_parent(mp)) + ret += xfs_parent_calc_space_res(mp, namelen); + + return ret; +} + +unsigned int +xfs_remove_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + unsigned int ret = XFS_DIRREMOVE_SPACE_RES(mp); + + if (xfs_has_parent(mp)) + ret += xfs_parent_calc_space_res(mp, namelen); + + return ret; +} + +unsigned int +xfs_rename_space_res( + struct xfs_mount *mp, + unsigned int src_namelen, + bool target_exists, + unsigned int target_namelen, + bool has_whiteout) +{ + unsigned int ret; + + ret = XFS_DIRREMOVE_SPACE_RES(mp) + + XFS_DIRENTER_SPACE_RES(mp, target_namelen); + + if (xfs_has_parent(mp)) { + if (has_whiteout) + ret += xfs_parent_calc_space_res(mp, src_namelen); + ret += 2 * xfs_parent_calc_space_res(mp, target_namelen); + } + + if (target_exists) + ret += xfs_parent_calc_space_res(mp, target_namelen); + + return ret; +} diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 87b31c69a773..1155ff2d37e2 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -10,6 +10,10 @@ * Components of space reservations. */ +/* Worst case number of bmaps that can be held in a block. */ +#define XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp) \ + (((mp)->m_bmap_dmxr[0]) - ((mp)->m_bmap_dmnr[0])) + /* Worst case number of rmaps that can be held in a block. 
*/ #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \ (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0])) @@ -76,31 +80,32 @@ /* This macro is not used - see inline code in xfs_attr_set */ #define XFS_ATTRSET_SPACE_RES(mp, v) \ (XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v)) -#define XFS_CREATE_SPACE_RES(mp,nl) \ - (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) #define XFS_DIOSTRAT_SPACE_RES(mp, v) \ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v)) #define XFS_GROWFS_SPACE_RES(mp) \ (2 * (mp)->m_alloc_maxlevels) #define XFS_GROWFSRT_SPACE_RES(mp,b) \ ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK)) -#define XFS_LINK_SPACE_RES(mp,nl) \ - XFS_DIRENTER_SPACE_RES(mp,nl) -#define XFS_MKDIR_SPACE_RES(mp,nl) \ - (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) #define XFS_QM_DQALLOC_SPACE_RES(mp) \ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \ XFS_DQUOT_CLUSTER_SIZE_FSB) #define XFS_QM_QINOCREATE_SPACE_RES(mp) \ XFS_IALLOC_SPACE_RES(mp) -#define XFS_REMOVE_SPACE_RES(mp) \ - XFS_DIRREMOVE_SPACE_RES(mp) -#define XFS_RENAME_SPACE_RES(mp,nl) \ - (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) -#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ - (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) #define XFS_IFREE_SPACE_RES(mp) \ (xfs_has_finobt(mp) ? 
M_IGEO(mp)->inobt_maxlevels : 0) +unsigned int xfs_parent_calc_space_res(struct xfs_mount *mp, + unsigned int namelen); + +unsigned int xfs_create_space_res(struct xfs_mount *mp, unsigned int namelen); +unsigned int xfs_mkdir_space_res(struct xfs_mount *mp, unsigned int namelen); +unsigned int xfs_link_space_res(struct xfs_mount *mp, unsigned int namelen); +unsigned int xfs_symlink_space_res(struct xfs_mount *mp, unsigned int namelen, + unsigned int fsblocks); +unsigned int xfs_remove_space_res(struct xfs_mount *mp, unsigned int namelen); + +unsigned int xfs_rename_space_res(struct xfs_mount *mp, + unsigned int src_namelen, bool target_exists, + unsigned int target_namelen, bool has_whiteout); #endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index e954f07679dd..f8e5b67128d2 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -15,6 +15,7 @@ #include "xfs_ialloc.h" #include "xfs_rmap.h" #include "xfs_ag.h" +#include "xfs_inode.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -165,8 +166,7 @@ xchk_superblock( xchk_block_set_corrupt(sc, bp); /* Check sb_versionnum bits that are set at mkfs time. */ - vernum_mask = cpu_to_be16(~XFS_SB_VERSION_OKBITS | - XFS_SB_VERSION_NUMBITS | + vernum_mask = cpu_to_be16(XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALIGNBIT | XFS_SB_VERSION_DALIGNBIT | XFS_SB_VERSION_SHAREDBIT | @@ -865,6 +865,43 @@ xchk_agi_xref( /* scrub teardown will take care of sc->sa for us */ } +/* + * Check the unlinked buckets for links to bad inodes. We hold the AGI, so + * there cannot be any threads updating unlinked list pointers in this AG. 
+ */ +STATIC void +xchk_iunlink( + struct xfs_scrub *sc, + struct xfs_agi *agi) +{ + unsigned int i; + struct xfs_inode *ip; + + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { + xfs_agino_t agino = be32_to_cpu(agi->agi_unlinked[i]); + + while (agino != NULLAGINO) { + if (agino % XFS_AGI_UNLINKED_BUCKETS != i) { + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + return; + } + + ip = xfs_iunlink_lookup(sc->sa.pag, agino); + if (!ip) { + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + return; + } + + if (!xfs_inode_on_unlinked_list(ip)) { + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + return; + } + + agino = ip->i_next_unlinked; + } + } +} + /* Scrub the AGI. */ int xchk_agi( @@ -949,6 +986,8 @@ xchk_agi( if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); + xchk_iunlink(sc, agi); + xchk_agi_xref(sc); out: return error; diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 427054b65b23..0dbc484b182f 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -21,13 +21,18 @@ #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_ag.h" +#include "xfs_inode.h" +#include "xfs_iunlink_item.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" #include "scrub/agb_bitmap.h" +#include "scrub/agino_bitmap.h" #include "scrub/reap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" /* Superblock */ @@ -796,15 +801,57 @@ enum { XREP_AGI_MAX }; +#define XREP_AGI_LOOKUP_BATCH 32 + +struct xrep_agi { + struct xfs_scrub *sc; + + /* AGI buffer, tracked separately */ + struct xfs_buf *agi_bp; + + /* context for finding btree roots */ + struct xrep_find_ag_btree fab[XREP_AGI_MAX]; + + /* old AGI contents in case we have to revert */ + struct xfs_agi old_agi; + + /* bitmap of which inodes are unlinked */ + struct xagino_bitmap iunlink_bmp; + + /* heads of the unlinked inode bucket lists */ 
+ xfs_agino_t iunlink_heads[XFS_AGI_UNLINKED_BUCKETS]; + + /* scratchpad for batched lookups of the radix tree */ + struct xfs_inode *lookup_batch[XREP_AGI_LOOKUP_BATCH]; + + /* Map of ino -> next_ino for unlinked inode processing. */ + struct xfarray *iunlink_next; + + /* Map of ino -> prev_ino for unlinked inode processing. */ + struct xfarray *iunlink_prev; +}; + +static void +xrep_agi_buf_cleanup( + void *buf) +{ + struct xrep_agi *ragi = buf; + + xfarray_destroy(ragi->iunlink_prev); + xfarray_destroy(ragi->iunlink_next); + xagino_bitmap_destroy(&ragi->iunlink_bmp); +} + /* * Given the inode btree roots described by *fab, find the roots, check them * for sanity, and pass the root data back out via *fab. */ STATIC int xrep_agi_find_btrees( - struct xfs_scrub *sc, - struct xrep_find_ag_btree *fab) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xrep_find_ag_btree *fab = ragi->fab; struct xfs_buf *agf_bp; struct xfs_mount *mp = sc->mp; int error; @@ -837,10 +884,11 @@ xrep_agi_find_btrees( */ STATIC void xrep_agi_init_header( - struct xfs_scrub *sc, - struct xfs_buf *agi_bp, - struct xfs_agi *old_agi) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xfs_buf *agi_bp = ragi->agi_bp; + struct xfs_agi *old_agi = &ragi->old_agi; struct xfs_agi *agi = agi_bp->b_addr; struct xfs_perag *pag = sc->sa.pag; struct xfs_mount *mp = sc->mp; @@ -856,10 +904,6 @@ xrep_agi_init_header( if (xfs_has_crc(mp)) uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); - /* We don't know how to fix the unlinked list yet. */ - memcpy(&agi->agi_unlinked, &old_agi->agi_unlinked, - sizeof(agi->agi_unlinked)); - /* Mark the incore AGF data stale until we're done fixing things. */ ASSERT(xfs_perag_initialised_agi(pag)); clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); @@ -868,10 +912,12 @@ xrep_agi_init_header( /* Set btree root information in an AGI. 
*/ STATIC void xrep_agi_set_roots( - struct xfs_scrub *sc, - struct xfs_agi *agi, - struct xrep_find_ag_btree *fab) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xfs_agi *agi = ragi->agi_bp->b_addr; + struct xrep_find_ag_btree *fab = ragi->fab; + agi->agi_root = cpu_to_be32(fab[XREP_AGI_INOBT].root); agi->agi_level = cpu_to_be32(fab[XREP_AGI_INOBT].height); @@ -884,9 +930,10 @@ xrep_agi_set_roots( /* Update the AGI counters. */ STATIC int xrep_agi_calc_from_btrees( - struct xfs_scrub *sc, - struct xfs_buf *agi_bp) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xfs_buf *agi_bp = ragi->agi_bp; struct xfs_btree_cur *cur; struct xfs_agi *agi = agi_bp->b_addr; struct xfs_mount *mp = sc->mp; @@ -928,12 +975,721 @@ err: return error; } +/* + * Record a forwards unlinked chain pointer from agino -> next_agino in our + * staging information. + */ +static inline int +xrep_iunlink_store_next( + struct xrep_agi *ragi, + xfs_agino_t agino, + xfs_agino_t next_agino) +{ + ASSERT(next_agino != 0); + + return xfarray_store(ragi->iunlink_next, agino, &next_agino); +} + +/* + * Record a backwards unlinked chain pointer from prev_ino <- agino in our + * staging information. + */ +static inline int +xrep_iunlink_store_prev( + struct xrep_agi *ragi, + xfs_agino_t agino, + xfs_agino_t prev_agino) +{ + ASSERT(prev_agino != 0); + + return xfarray_store(ragi->iunlink_prev, agino, &prev_agino); +} + +/* + * Given an @agino, look up the next inode in the iunlink bucket. Returns + * NULLAGINO if we're at the end of the chain, 0 if @agino is not in memory + * like it should be, or a per-AG inode number. + */ +static inline xfs_agino_t +xrep_iunlink_next( + struct xfs_scrub *sc, + xfs_agino_t agino) +{ + struct xfs_inode *ip; + + ip = xfs_iunlink_lookup(sc->sa.pag, agino); + if (!ip) + return 0; + + return ip->i_next_unlinked; +} + +/* + * Load the inode @agino into memory, set its i_prev_unlinked, and drop the + * inode so it can be inactivated. 
Returns NULLAGINO if we're at the end of + * the chain or if we should stop walking the chain due to corruption; or a + * per-AG inode number. + */ +STATIC xfs_agino_t +xrep_iunlink_reload_next( + struct xrep_agi *ragi, + xfs_agino_t prev_agino, + xfs_agino_t agino) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_inode *ip; + xfs_ino_t ino; + xfs_agino_t ret = NULLAGINO; + int error; + + ino = XFS_AGINO_TO_INO(sc->mp, sc->sa.pag->pag_agno, agino); + error = xchk_iget(ragi->sc, ino, &ip); + if (error) + return ret; + + trace_xrep_iunlink_reload_next(ip, prev_agino); + + /* If this is a linked inode, stop processing the chain. */ + if (VFS_I(ip)->i_nlink != 0) { + xrep_iunlink_store_next(ragi, agino, NULLAGINO); + goto rele; + } + + ip->i_prev_unlinked = prev_agino; + ret = ip->i_next_unlinked; + + /* + * Drop the inode reference that we just took. We hold the AGI, so + * this inode cannot move off the unlinked list and hence cannot be + * reclaimed. + */ +rele: + xchk_irele(sc, ip); + return ret; +} + +/* + * Walk an AGI unlinked bucket's list to load incore any unlinked inodes that + * still existed at mount time. This can happen if iunlink processing fails + * during log recovery. 
+ */ +STATIC int +xrep_iunlink_walk_ondisk_bucket( + struct xrep_agi *ragi, + unsigned int bucket) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + xfs_agino_t prev_agino = NULLAGINO; + xfs_agino_t next_agino; + int error = 0; + + next_agino = be32_to_cpu(agi->agi_unlinked[bucket]); + while (next_agino != NULLAGINO) { + xfs_agino_t agino = next_agino; + + if (xchk_should_terminate(ragi->sc, &error)) + return error; + + trace_xrep_iunlink_walk_ondisk_bucket(sc->sa.pag, bucket, + prev_agino, agino); + + if (bucket != agino % XFS_AGI_UNLINKED_BUCKETS) + break; + + next_agino = xrep_iunlink_next(sc, agino); + if (!next_agino) + next_agino = xrep_iunlink_reload_next(ragi, prev_agino, + agino); + + prev_agino = agino; + } + + return 0; +} + +/* Decide if this is an unlinked inode in this AG. */ +STATIC bool +xrep_iunlink_igrab( + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = pag->pag_mount; + + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + return false; + + if (!xfs_inode_on_unlinked_list(ip)) + return false; + + return true; +} + +/* + * Mark the given inode in the lookup batch in our unlinked inode bitmap, and + * remember if this inode is the start of the unlinked chain. 
+ */ +STATIC int +xrep_iunlink_visit( + struct xrep_agi *ragi, + unsigned int batch_idx) +{ + struct xfs_mount *mp = ragi->sc->mp; + struct xfs_inode *ip = ragi->lookup_batch[batch_idx]; + xfs_agino_t agino; + unsigned int bucket; + int error; + + ASSERT(XFS_INO_TO_AGNO(mp, ip->i_ino) == ragi->sc->sa.pag->pag_agno); + ASSERT(xfs_inode_on_unlinked_list(ip)); + + agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + bucket = agino % XFS_AGI_UNLINKED_BUCKETS; + + trace_xrep_iunlink_visit(ragi->sc->sa.pag, bucket, + ragi->iunlink_heads[bucket], ip); + + error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1); + if (error) + return error; + + if (ip->i_prev_unlinked == NULLAGINO) { + if (ragi->iunlink_heads[bucket] == NULLAGINO) + ragi->iunlink_heads[bucket] = agino; + } + + return 0; +} + +/* + * Find all incore unlinked inodes so that we can rebuild the unlinked buckets. + * We hold the AGI so there should not be any modifications to the unlinked + * list. + */ +STATIC int +xrep_iunlink_mark_incore( + struct xrep_agi *ragi) +{ + struct xfs_perag *pag = ragi->sc->sa.pag; + struct xfs_mount *mp = pag->pag_mount; + uint32_t first_index = 0; + bool done = false; + unsigned int nr_found = 0; + + do { + unsigned int i; + int error = 0; + + if (xchk_should_terminate(ragi->sc, &error)) + return error; + + rcu_read_lock(); + + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)&ragi->lookup_batch, first_index, + XREP_AGI_LOOKUP_BATCH); + if (!nr_found) { + rcu_read_unlock(); + return 0; + } + + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = ragi->lookup_batch[i]; + + if (done || !xrep_iunlink_igrab(pag, ip)) + ragi->lookup_batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can occur if + * we have inodes in the last block of the AG and we + * are currently pointing to the last inode. 
+ * + * Because we may see inodes that are from the wrong AG + * due to RCU freeing and reallocation, only update the + * index if it lies in this AG. It was a race that led + * us to see this inode, so another lookup from the + * same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = true; + } + + /* unlock now we've grabbed the inodes. */ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!ragi->lookup_batch[i]) + continue; + error = xrep_iunlink_visit(ragi, i); + if (error) + return error; + } + } while (!done); + + return 0; +} + +/* Mark all the unlinked ondisk inodes in this inobt record in iunlink_bmp. */ +STATIC int +xrep_iunlink_mark_ondisk_rec( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_inobt_rec_incore irec; + struct xrep_agi *ragi = priv; + struct xfs_scrub *sc = ragi->sc; + struct xfs_mount *mp = cur->bc_mp; + xfs_agino_t agino; + unsigned int i; + int error = 0; + + xfs_inobt_btrec_to_irec(mp, rec, &irec); + + for (i = 0, agino = irec.ir_startino; + i < XFS_INODES_PER_CHUNK; + i++, agino++) { + struct xfs_inode *ip; + unsigned int len = 1; + + /* Skip free inodes */ + if (XFS_INOBT_MASK(i) & irec.ir_free) + continue; + /* Skip inodes we've seen before */ + if (xagino_bitmap_test(&ragi->iunlink_bmp, agino, &len)) + continue; + + /* + * Skip incore inodes; these were already picked up by + * the _mark_incore step. + */ + rcu_read_lock(); + ip = radix_tree_lookup(&sc->sa.pag->pag_ici_root, agino); + rcu_read_unlock(); + if (ip) + continue; + + /* + * Try to look up this inode. If we can't get it, just move + * on because we haven't actually scrubbed the inobt or the + * inodes yet. 
+ */ + error = xchk_iget(ragi->sc, + XFS_AGINO_TO_INO(mp, sc->sa.pag->pag_agno, + agino), + &ip); + if (error) + continue; + + trace_xrep_iunlink_reload_ondisk(ip); + + if (VFS_I(ip)->i_nlink == 0) + error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1); + xchk_irele(sc, ip); + if (error) + break; + } + + return error; +} + +/* + * Find ondisk inodes that are unlinked and not in cache, and mark them in + * iunlink_bmp. We haven't checked the inobt yet, so we don't error out if + * the btree is corrupt. + */ +STATIC void +xrep_iunlink_mark_ondisk( + struct xrep_agi *ragi) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_buf *agi_bp = ragi->agi_bp; + struct xfs_btree_cur *cur; + int error; + + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp); + error = xfs_btree_query_all(cur, xrep_iunlink_mark_ondisk_rec, ragi); + xfs_btree_del_cursor(cur, error); +} + +/* + * Walk an iunlink bucket's inode list. For each inode that should be on this + * chain, clear its entry in iunlink_bmp because it's ok and we don't need + * to touch it further. + */ +STATIC int +xrep_iunlink_resolve_bucket( + struct xrep_agi *ragi, + unsigned int bucket) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_inode *ip; + xfs_agino_t prev_agino = NULLAGINO; + xfs_agino_t next_agino = ragi->iunlink_heads[bucket]; + int error = 0; + + while (next_agino != NULLAGINO) { + if (xchk_should_terminate(ragi->sc, &error)) + return error; + + /* Find the next inode in the chain. */ + ip = xfs_iunlink_lookup(sc->sa.pag, next_agino); + if (!ip) { + /* Inode not incore? Terminate the chain. */ + trace_xrep_iunlink_resolve_uncached(sc->sa.pag, + bucket, prev_agino, next_agino); + + next_agino = NULLAGINO; + break; + } + + if (next_agino % XFS_AGI_UNLINKED_BUCKETS != bucket) { + /* + * Inode is in the wrong bucket. Advance the list, + * but pretend we didn't see this inode. 
+ */ + trace_xrep_iunlink_resolve_wronglist(sc->sa.pag, + bucket, prev_agino, next_agino); + + next_agino = ip->i_next_unlinked; + continue; + } + + if (!xfs_inode_on_unlinked_list(ip)) { + /* + * Incore inode doesn't think this inode is on an + * unlinked list. This is probably because we reloaded + * it from disk. Advance the list, but pretend we + * didn't see this inode; we'll fix that later. + */ + trace_xrep_iunlink_resolve_nolist(sc->sa.pag, + bucket, prev_agino, next_agino); + next_agino = ip->i_next_unlinked; + continue; + } + + trace_xrep_iunlink_resolve_ok(sc->sa.pag, bucket, prev_agino, + next_agino); + + /* + * Otherwise, this inode's unlinked pointers are ok. Clear it + * from the unlinked bitmap since we're done with it, and make + * sure the chain is still correct. + */ + error = xagino_bitmap_clear(&ragi->iunlink_bmp, next_agino, 1); + if (error) + return error; + + /* Remember the previous inode's next pointer. */ + if (prev_agino != NULLAGINO) { + error = xrep_iunlink_store_next(ragi, prev_agino, + next_agino); + if (error) + return error; + } + + /* Remember this inode's previous pointer. */ + error = xrep_iunlink_store_prev(ragi, next_agino, prev_agino); + if (error) + return error; + + /* Advance the list and remember this inode. */ + prev_agino = next_agino; + next_agino = ip->i_next_unlinked; + } + + /* Update the previous inode's next pointer. */ + if (prev_agino != NULLAGINO) { + error = xrep_iunlink_store_next(ragi, prev_agino, next_agino); + if (error) + return error; + } + + return 0; +} + +/* Reinsert this unlinked inode into the head of the staged bucket list. */ +STATIC int +xrep_iunlink_add_to_bucket( + struct xrep_agi *ragi, + xfs_agino_t agino) +{ + xfs_agino_t current_head; + unsigned int bucket; + int error; + + bucket = agino % XFS_AGI_UNLINKED_BUCKETS; + + /* Point this inode at the current head of the bucket list. 
*/ + current_head = ragi->iunlink_heads[bucket]; + + trace_xrep_iunlink_add_to_bucket(ragi->sc->sa.pag, bucket, agino, + current_head); + + error = xrep_iunlink_store_next(ragi, agino, current_head); + if (error) + return error; + + /* Remember the head inode's previous pointer. */ + if (current_head != NULLAGINO) { + error = xrep_iunlink_store_prev(ragi, current_head, agino); + if (error) + return error; + } + + ragi->iunlink_heads[bucket] = agino; + return 0; +} + +/* Reinsert unlinked inodes into the staged iunlink buckets. */ +STATIC int +xrep_iunlink_add_lost_inodes( + uint32_t start, + uint32_t len, + void *priv) +{ + struct xrep_agi *ragi = priv; + int error; + + for (; len > 0; start++, len--) { + error = xrep_iunlink_add_to_bucket(ragi, start); + if (error) + return error; + } + + return 0; +} + +/* + * Figure out the iunlink bucket values and find inodes that need to be + * reinserted into the list. + */ +STATIC int +xrep_iunlink_rebuild_buckets( + struct xrep_agi *ragi) +{ + unsigned int i; + int error; + + /* + * Walk the ondisk AGI unlinked list to find inodes that are on the + * list but aren't in memory. This can happen if a past log recovery + * tried to clear the iunlinked list but failed. Our scan rebuilds the + * unlinked list using incore inodes, so we must load and link them + * properly. + */ + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { + error = xrep_iunlink_walk_ondisk_bucket(ragi, i); + if (error) + return error; + } + + /* + * Record all the incore unlinked inodes in iunlink_bmp that we didn't + * find by walking the ondisk iunlink buckets. This shouldn't happen, + * but we can't risk forgetting an inode somewhere. + */ + error = xrep_iunlink_mark_incore(ragi); + if (error) + return error; + + /* + * If there are ondisk inodes that are unlinked and have not been loaded + * into cache, record them in iunlink_bmp. 
+ */ + xrep_iunlink_mark_ondisk(ragi); + + /* + * Walk each iunlink bucket to (re)construct as much of the incore list + * as would be correct. For each inode that survives this step, mark + * it clear in iunlink_bmp; we're done with those inodes. + */ + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { + error = xrep_iunlink_resolve_bucket(ragi, i); + if (error) + return error; + } + + /* + * Any unlinked inodes that we didn't find through the bucket list + * walk (or that were ignored by the walk) must be inserted into the bucket + * list. Stage this in memory for now. + */ + return xagino_bitmap_walk(&ragi->iunlink_bmp, + xrep_iunlink_add_lost_inodes, ragi); +} + +/* Update i_next_unlinked for the inode @agino. */ +STATIC int +xrep_iunlink_relink_next( + struct xrep_agi *ragi, + xfarray_idx_t idx, + xfs_agino_t next_agino) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_inode *ip; + xfarray_idx_t agino = idx - 1; + bool want_rele = false; + int error = 0; + + ip = xfs_iunlink_lookup(pag, agino); + if (!ip) { + xfs_ino_t ino; + xfs_agino_t prev_agino; + + /* + * No inode exists in cache. Load it off the disk so that we + * can reinsert it into the incore unlinked list. + */ + ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino); + error = xchk_iget(sc, ino, &ip); + if (error) + return -EFSCORRUPTED; + + want_rele = true; + + /* Set the backward pointer since this just came off disk. */ + error = xfarray_load(ragi->iunlink_prev, agino, &prev_agino); + if (error) + goto out_rele; + + trace_xrep_iunlink_relink_prev(ip, prev_agino); + ip->i_prev_unlinked = prev_agino; + } + + /* Update the forward pointer. 
*/ + if (ip->i_next_unlinked != next_agino) { + error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino); + if (error) + goto out_rele; + + trace_xrep_iunlink_relink_next(ip, next_agino); + ip->i_next_unlinked = next_agino; + } + +out_rele: + /* + * The iunlink lookup doesn't igrab because we hold the AGI buffer lock + * and the inode cannot be reclaimed. However, if we used iget to load + * a missing inode, we must irele it here. + */ + if (want_rele) + xchk_irele(sc, ip); + return error; +} + +/* Update i_prev_unlinked for the inode @agino. */ +STATIC int +xrep_iunlink_relink_prev( + struct xrep_agi *ragi, + xfarray_idx_t idx, + xfs_agino_t prev_agino) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_inode *ip; + xfarray_idx_t agino = idx - 1; + bool want_rele = false; + int error = 0; + + ASSERT(prev_agino != 0); + + ip = xfs_iunlink_lookup(pag, agino); + if (!ip) { + xfs_ino_t ino; + xfs_agino_t next_agino; + + /* + * No inode exists in cache. Load it off the disk so that we + * can reinsert it into the incore unlinked list. + */ + ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino); + error = xchk_iget(sc, ino, &ip); + if (error) + return -EFSCORRUPTED; + + want_rele = true; + + /* Set the forward pointer since this just came off disk. */ + error = xfarray_load(ragi->iunlink_next, agino, &next_agino); + if (error) + goto out_rele; + + error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino); + if (error) + goto out_rele; + + trace_xrep_iunlink_relink_next(ip, next_agino); + ip->i_next_unlinked = next_agino; + } + + /* Update the backward pointer. */ + if (ip->i_prev_unlinked != prev_agino) { + trace_xrep_iunlink_relink_prev(ip, prev_agino); + ip->i_prev_unlinked = prev_agino; + } + +out_rele: + /* + * The iunlink lookup doesn't igrab because we hold the AGI buffer lock + * and the inode cannot be reclaimed. However, if we used iget to load + * a missing inode, we must irele it here. 
+ */ + if (want_rele) + xchk_irele(sc, ip); + return error; +} + +/* Log all the iunlink updates we need to finish regenerating the AGI. */ +STATIC int +xrep_iunlink_commit( + struct xrep_agi *ragi) +{ + struct xfs_agi *agi = ragi->agi_bp->b_addr; + xfarray_idx_t idx = XFARRAY_CURSOR_INIT; + xfs_agino_t agino; + unsigned int i; + int error; + + /* Fix all the forward links */ + while ((error = xfarray_iter(ragi->iunlink_next, &idx, &agino)) == 1) { + error = xrep_iunlink_relink_next(ragi, idx, agino); + if (error) + return error; + } + + /* Fix all the back links */ + idx = XFARRAY_CURSOR_INIT; + while ((error = xfarray_iter(ragi->iunlink_prev, &idx, &agino)) == 1) { + error = xrep_iunlink_relink_prev(ragi, idx, agino); + if (error) + return error; + } + + /* Copy the staged iunlink buckets to the new AGI. */ + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { + trace_xrep_iunlink_commit_bucket(ragi->sc->sa.pag, i, + be32_to_cpu(ragi->old_agi.agi_unlinked[i]), + ragi->iunlink_heads[i]); + + agi->agi_unlinked[i] = cpu_to_be32(ragi->iunlink_heads[i]); + } + + return 0; +} + /* Trigger reinitialization of the in-core data. */ STATIC int xrep_agi_commit_new( - struct xfs_scrub *sc, - struct xfs_buf *agi_bp) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xfs_buf *agi_bp = ragi->agi_bp; struct xfs_perag *pag; struct xfs_agi *agi = agi_bp->b_addr; @@ -956,33 +1712,58 @@ xrep_agi_commit_new( /* Repair the AGI. 
*/ int xrep_agi( - struct xfs_scrub *sc) + struct xfs_scrub *sc) { - struct xrep_find_ag_btree fab[XREP_AGI_MAX] = { - [XREP_AGI_INOBT] = { - .rmap_owner = XFS_RMAP_OWN_INOBT, - .buf_ops = &xfs_inobt_buf_ops, - .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, - }, - [XREP_AGI_FINOBT] = { - .rmap_owner = XFS_RMAP_OWN_INOBT, - .buf_ops = &xfs_finobt_buf_ops, - .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, - }, - [XREP_AGI_END] = { - .buf_ops = NULL - }, - }; - struct xfs_agi old_agi; - struct xfs_mount *mp = sc->mp; - struct xfs_buf *agi_bp; - struct xfs_agi *agi; - int error; + struct xrep_agi *ragi; + struct xfs_mount *mp = sc->mp; + char *descr; + unsigned int i; + int error; /* We require the rmapbt to rebuild anything. */ if (!xfs_has_rmapbt(mp)) return -EOPNOTSUPP; + sc->buf = kzalloc(sizeof(struct xrep_agi), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + ragi = sc->buf; + ragi->sc = sc; + + ragi->fab[XREP_AGI_INOBT] = (struct xrep_find_ag_btree){ + .rmap_owner = XFS_RMAP_OWN_INOBT, + .buf_ops = &xfs_inobt_buf_ops, + .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, + }; + ragi->fab[XREP_AGI_FINOBT] = (struct xrep_find_ag_btree){ + .rmap_owner = XFS_RMAP_OWN_INOBT, + .buf_ops = &xfs_finobt_buf_ops, + .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, + }; + ragi->fab[XREP_AGI_END] = (struct xrep_find_ag_btree){ + .buf_ops = NULL, + }; + + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) + ragi->iunlink_heads[i] = NULLAGINO; + + xagino_bitmap_init(&ragi->iunlink_bmp); + sc->buf_cleanup = xrep_agi_buf_cleanup; + + descr = xchk_xfile_ag_descr(sc, "iunlinked next pointers"); + error = xfarray_create(descr, 0, sizeof(xfs_agino_t), + &ragi->iunlink_next); + kfree(descr); + if (error) + return error; + + descr = xchk_xfile_ag_descr(sc, "iunlinked prev pointers"); + error = xfarray_create(descr, 0, sizeof(xfs_agino_t), + &ragi->iunlink_prev); + kfree(descr); + if (error) + return error; + /* * Make sure we have the AGI buffer, as scrub might have decided it * was corrupt after 
xfs_ialloc_read_agi failed with -EFSCORRUPTED. @@ -990,14 +1771,17 @@ xrep_agi( error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &agi_bp, NULL); + XFS_FSS_TO_BB(mp, 1), 0, &ragi->agi_bp, NULL); if (error) return error; - agi_bp->b_ops = &xfs_agi_buf_ops; - agi = agi_bp->b_addr; + ragi->agi_bp->b_ops = &xfs_agi_buf_ops; /* Find the AGI btree roots. */ - error = xrep_agi_find_btrees(sc, fab); + error = xrep_agi_find_btrees(ragi); + if (error) + return error; + + error = xrep_iunlink_rebuild_buckets(ragi); if (error) return error; @@ -1006,18 +1790,21 @@ xrep_agi( return error; /* Start rewriting the header and implant the btrees we found. */ - xrep_agi_init_header(sc, agi_bp, &old_agi); - xrep_agi_set_roots(sc, agi, fab); - error = xrep_agi_calc_from_btrees(sc, agi_bp); + xrep_agi_init_header(ragi); + xrep_agi_set_roots(ragi); + error = xrep_agi_calc_from_btrees(ragi); + if (error) + goto out_revert; + error = xrep_iunlink_commit(ragi); if (error) goto out_revert; /* Reinitialize in-core state. */ - return xrep_agi_commit_new(sc, agi_bp); + return xrep_agi_commit_new(ragi); out_revert: /* Mark the incore AGI state stale and revert the AGI. */ clear_bit(XFS_AGSTATE_AGI_INIT, &sc->sa.pag->pag_opstate); - memcpy(agi, &old_agi, sizeof(old_agi)); + memcpy(ragi->agi_bp->b_addr, &ragi->old_agi, sizeof(struct xfs_agi)); return error; } diff --git a/fs/xfs/scrub/agino_bitmap.h b/fs/xfs/scrub/agino_bitmap.h new file mode 100644 index 000000000000..56d7db5f1699 --- /dev/null +++ b/fs/xfs/scrub/agino_bitmap.h @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_AGINO_BITMAP_H__ +#define __XFS_SCRUB_AGINO_BITMAP_H__ + +/* Bitmaps, but type-checked for xfs_agino_t */ + +struct xagino_bitmap { + struct xbitmap32 aginobitmap; +}; + +static inline void xagino_bitmap_init(struct xagino_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->aginobitmap); +} + +static inline void xagino_bitmap_destroy(struct xagino_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->aginobitmap); +} + +static inline int xagino_bitmap_clear(struct xagino_bitmap *bitmap, + xfs_agino_t agino, unsigned int len) +{ + return xbitmap32_clear(&bitmap->aginobitmap, agino, len); +} + +static inline int xagino_bitmap_set(struct xagino_bitmap *bitmap, + xfs_agino_t agino, unsigned int len) +{ + return xbitmap32_set(&bitmap->aginobitmap, agino, len); +} + +static inline bool xagino_bitmap_test(struct xagino_bitmap *bitmap, + xfs_agino_t agino, unsigned int *len) +{ + return xbitmap32_test(&bitmap->aginobitmap, agino, len); +} + +static inline int xagino_bitmap_walk(struct xagino_bitmap *bitmap, + xbitmap32_walk_fn fn, void *priv) +{ + return xbitmap32_walk(&bitmap->aginobitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_AGINO_BITMAP_H__ */ diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c index d421b253923e..30295898cc8a 100644 --- a/fs/xfs/scrub/alloc_repair.c +++ b/fs/xfs/scrub/alloc_repair.c @@ -778,7 +778,7 @@ xrep_abt_build_new_trees( error = xrep_bnobt_sort_records(ra); if (error) - return error; + goto err_levels; /* Load the free space by block number tree. 
*/ ra->array_cur = XFARRAY_CURSOR_INIT; diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 83c7feb38714..708334f9b2bd 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -10,16 +10,20 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_inode.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" #include "xfs_attr_sf.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" #include "scrub/attr.h" +#include "scrub/listxattr.h" +#include "scrub/repair.h" /* Free the buffers linked from the xattr buffer. */ static void @@ -35,6 +39,8 @@ xchk_xattr_buf_cleanup( kvfree(ab->value); ab->value = NULL; ab->value_sz = 0; + kvfree(ab->name); + ab->name = NULL; } /* @@ -65,7 +71,7 @@ xchk_xattr_want_freemap( * reallocating the buffer if necessary. Buffer contents are not preserved * across a reallocation. */ -static int +int xchk_setup_xattr_buf( struct xfs_scrub *sc, size_t value_size) @@ -95,6 +101,12 @@ xchk_setup_xattr_buf( return -ENOMEM; } + if (xchk_could_repair(sc)) { + ab->name = kvmalloc(XATTR_NAME_MAX + 1, XCHK_GFP_FLAGS); + if (!ab->name) + return -ENOMEM; + } + resize_value: if (ab->value_sz >= value_size) return 0; @@ -121,6 +133,12 @@ xchk_setup_xattr( { int error; + if (xchk_could_repair(sc)) { + error = xrep_setup_xattr(sc); + if (error) + return error; + } + /* * We failed to get memory while checking attrs, so this time try to * get all the memory we're ever going to need. Allocate the buffer @@ -137,106 +155,105 @@ xchk_setup_xattr( /* Extended Attributes */ -struct xchk_xattr { - struct xfs_attr_list_context context; - struct xfs_scrub *sc; -}; - /* * Check that an extended attribute key can be looked up by hash. * - * We use the XFS attribute list iterator (i.e. xfs_attr_list_ilocked) - * to call this function for every attribute key in an inode. 
Once - * we're here, we load the attribute value to see if any errors happen, - * or if we get more or less data than we expected. + * We use the extended attribute walk helper to call this function for every + * attribute key in an inode. Once we're here, we load the attribute value to + * see if any errors happen, or if we get more or less data than we expected. */ -static void -xchk_xattr_listent( - struct xfs_attr_list_context *context, - int flags, - unsigned char *name, - int namelen, - int valuelen) +static int +xchk_xattr_actor( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) { struct xfs_da_args args = { - .op_flags = XFS_DA_OP_NOTIME, - .attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK, - .geo = context->dp->i_mount->m_attr_geo, + .attr_filter = attr_flags & XFS_ATTR_NSP_ONDISK_MASK, + .geo = sc->mp->m_attr_geo, .whichfork = XFS_ATTR_FORK, - .dp = context->dp, + .dp = ip, .name = name, .namelen = namelen, - .hashval = xfs_da_hashname(name, namelen), - .trans = context->tp, + .trans = sc->tp, .valuelen = valuelen, + .owner = ip->i_ino, }; struct xchk_xattr_buf *ab; - struct xchk_xattr *sx; int error = 0; - sx = container_of(context, struct xchk_xattr, context); - ab = sx->sc->buf; + ab = sc->buf; - if (xchk_should_terminate(sx->sc, &error)) { - context->seen_enough = error; - return; + if (xchk_should_terminate(sc, &error)) + return error; + + if (attr_flags & ~XFS_ATTR_ONDISK_MASK) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno); + return -ECANCELED; } - if (flags & XFS_ATTR_INCOMPLETE) { + if (attr_flags & XFS_ATTR_INCOMPLETE) { /* Incomplete attr key, just mark the inode for preening. */ - xchk_ino_set_preen(sx->sc, context->dp->i_ino); - return; + xchk_ino_set_preen(sc, ip->i_ino); + return 0; } - /* Only one namespace bit allowed. 
*/ - if (hweight32(flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) { - xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); - goto fail_xref; + /* Does this name make sense? */ + if (!xfs_attr_namecheck(attr_flags, name, namelen)) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno); + return -ECANCELED; } - /* Does this name make sense? */ - if (!xfs_attr_namecheck(name, namelen)) { - xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); - goto fail_xref; + /* Check parent pointer record. */ + if ((attr_flags & XFS_ATTR_PARENT) && + !xfs_parent_valuecheck(sc->mp, value, valuelen)) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno); + return -ECANCELED; } /* - * Local xattr values are stored in the attr leaf block, so we don't - * need to retrieve the value from a remote block to detect corruption - * problems. + * Try to allocate enough memory to extract the attr value. If that + * doesn't work, return -EDEADLOCK as a signal to try again with a + * maximally sized buffer. */ - if (flags & XFS_ATTR_LOCAL) - goto fail_xref; + error = xchk_setup_xattr_buf(sc, valuelen); + if (error == -ENOMEM) + error = -EDEADLOCK; + if (error) + return error; /* - * Try to allocate enough memory to extrat the attr value. If that - * doesn't work, we overload the seen_enough variable to convey - * the error message back to the main scrub function. + * Parent pointers are matched on attr name and value, so we must + * supply the xfs_parent_rec here when confirming that the dabtree + * indexing works correctly. */ - error = xchk_setup_xattr_buf(sx->sc, valuelen); - if (error == -ENOMEM) - error = -EDEADLOCK; - if (error) { - context->seen_enough = error; - return; - } + if (attr_flags & XFS_ATTR_PARENT) + memcpy(ab->value, value, valuelen); args.value = ab->value; + /* + * Get the attr value to ensure that lookup can find this attribute + * through the dabtree indexing and that remote value retrieval also + * works correctly. 
+ */ + xfs_attr_sethash(&args); error = xfs_attr_get_ilocked(&args); /* ENODATA means the hash lookup failed and the attr is bad */ if (error == -ENODATA) error = -EFSCORRUPTED; - if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, + if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, args.blkno, &error)) - goto fail_xref; + return error; if (args.valuelen != valuelen) - xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, - args.blkno); -fail_xref: - if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - context->seen_enough = 1; - return; + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno); + + return 0; } /* @@ -246,7 +263,7 @@ fail_xref: * Within a char, the lowest bit of the char represents the byte with * the smallest address */ -STATIC bool +bool xchk_xattr_set_map( struct xfs_scrub *sc, unsigned long *map, @@ -403,6 +420,17 @@ xchk_xattr_block( xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); hdrsize = xfs_attr3_leaf_hdr_size(leaf); + /* + * Empty xattr leaf blocks mapped at block 0 are probably a byproduct + * of a race between setxattr and a log shutdown. Anywhere else in the + * attr fork is a corruption. + */ + if (leafhdr.count == 0) { + if (blk->blkno == 0) + xchk_da_set_preen(ds, level); + else + xchk_da_set_corrupt(ds, level); + } if (leafhdr.usedbytes > mp->m_attr_geo->blksize) xchk_da_set_corrupt(ds, level); if (leafhdr.firstused > mp->m_attr_geo->blksize) @@ -411,6 +439,8 @@ xchk_xattr_block( xchk_da_set_corrupt(ds, level); if (!xchk_xattr_set_map(ds->sc, ab->usedmap, 0, hdrsize)) xchk_da_set_corrupt(ds, level); + if (leafhdr.holes) + xchk_da_set_preen(ds, level); if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) goto out; @@ -463,7 +493,6 @@ xchk_xattr_rec( xfs_dahash_t hash; int nameidx; int hdrsize; - unsigned int badflags; int error; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); @@ -493,10 +522,15 @@ xchk_xattr_rec( /* Retrieve the entry and check it. 
*/ hash = be32_to_cpu(ent->hashval); - badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE | - XFS_ATTR_INCOMPLETE); - if ((ent->flags & badflags) != 0) + if (ent->flags & ~XFS_ATTR_ONDISK_MASK) { + xchk_da_set_corrupt(ds, level); + return 0; + } + if (!xfs_attr_check_namespace(ent->flags)) { xchk_da_set_corrupt(ds, level); + return 0; + } + if (ent->flags & XFS_ATTR_LOCAL) { lentry = (struct xfs_attr_leaf_name_local *) (((char *)bp->b_addr) + nameidx); @@ -504,7 +538,10 @@ xchk_xattr_rec( xchk_da_set_corrupt(ds, level); goto out; } - calc_hash = xfs_da_hashname(lentry->nameval, lentry->namelen); + calc_hash = xfs_attr_hashval(mp, ent->flags, lentry->nameval, + lentry->namelen, + lentry->nameval + lentry->namelen, + be16_to_cpu(lentry->valuelen)); } else { rentry = (struct xfs_attr_leaf_name_remote *) (((char *)bp->b_addr) + nameidx); @@ -512,7 +549,13 @@ xchk_xattr_rec( xchk_da_set_corrupt(ds, level); goto out; } - calc_hash = xfs_da_hashname(rentry->name, rentry->namelen); + if (ent->flags & XFS_ATTR_PARENT) { + xchk_da_set_corrupt(ds, level); + goto out; + } + calc_hash = xfs_attr_hashval(mp, ent->flags, rentry->name, + rentry->namelen, NULL, + be32_to_cpu(rentry->valuelen)); } if (calc_hash != hash) xchk_da_set_corrupt(ds, level); @@ -556,6 +599,15 @@ xchk_xattr_check_sf( break; } + /* + * Shortform entries do not set LOCAL or INCOMPLETE, so the + * only valid flag bits here are for namespaces. 
+ */ + if (sfe->flags & ~XFS_ATTR_NSP_ONDISK_MASK) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)sfe - (char *)sf, sizeof(struct xfs_attr_sf_entry))) { @@ -588,16 +640,6 @@ int xchk_xattr( struct xfs_scrub *sc) { - struct xchk_xattr sx = { - .sc = sc, - .context = { - .dp = sc->ip, - .tp = sc->tp, - .resynch = 1, - .put_listent = xchk_xattr_listent, - .allow_incomplete = true, - }, - }; xfs_dablk_t last_checked = -1U; int error = 0; @@ -626,12 +668,6 @@ xchk_xattr( /* * Look up every xattr in this file by name and hash. * - * Use the backend implementation of xfs_attr_list to call - * xchk_xattr_listent on every attribute key in this inode. - * In other words, we use the same iterator/callback mechanism - * that listattr uses to scrub extended attributes, though in our - * _listent function, we check the value of the attribute. - * * The VFS only locks i_rwsem when modifying attrs, so keep all * three locks held because that's the only way to ensure we're * the only thread poking into the da btree. We traverse the da @@ -639,13 +675,9 @@ xchk_xattr( * iteration, which doesn't really follow the usual buffer * locking order. */ - error = xfs_attr_list_ilocked(&sx.context); + error = xchk_xattr_walk(sc, sc->ip, xchk_xattr_actor, NULL, NULL); if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error)) return error; - /* Did our listent function try to return any errors? */ - if (sx.context.seen_enough < 0) - return sx.context.seen_enough; - return 0; } diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h index 48fd9402c432..7db58af56646 100644 --- a/fs/xfs/scrub/attr.h +++ b/fs/xfs/scrub/attr.h @@ -16,9 +16,16 @@ struct xchk_xattr_buf { /* Bitmap of free space in xattr leaf blocks. */ unsigned long *freemap; + /* Memory buffer used to hold salvaged xattr names. */ + unsigned char *name; + /* Memory buffer used to extract xattr values. 
*/ void *value; size_t value_sz; }; +bool xchk_xattr_set_map(struct xfs_scrub *sc, unsigned long *map, + unsigned int start, unsigned int len); +int xchk_setup_xattr_buf(struct xfs_scrub *sc, size_t value_size); + #endif /* __XFS_SCRUB_ATTR_H__ */ diff --git a/fs/xfs/scrub/attr_repair.c b/fs/xfs/scrub/attr_repair.c new file mode 100644 index 000000000000..c7eb94069caf --- /dev/null +++ b/fs/xfs/scrub/attr_repair.c @@ -0,0 +1,1663 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_sf.h" +#include "xfs_attr_remote.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_exchmaps.h" +#include "xfs_exchrange.h" +#include "xfs_acl.h" +#include "xfs_parent.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/attr.h" +#include "scrub/reap.h" +#include "scrub/attr_repair.h" + +/* + * Extended Attribute Repair + * ========================= + * + * We repair extended attributes by reading the attr leaf blocks looking for + * attribute entries that look salvageable (name passes verifiers, value can + * be retrieved, etc). 
Each extended attribute worth salvaging is stashed in + * memory, and the stashed entries are periodically replayed into a temporary + * file to constrain memory use. Batching the construction of the temporary + * extended attribute structure in this fashion reduces lock cycling of the + * file being repaired and the temporary file. + * + * When salvaging completes, the remaining stashed attributes are replayed to + * the temporary file. An atomic file contents exchange is used to commit the + * new xattr blocks to the file being repaired. This will disrupt attrmulti + * cursors. + */ + +struct xrep_xattr_key { + /* Cookie for retrieval of the xattr name. */ + xfblob_cookie name_cookie; + + /* Cookie for retrieval of the xattr value. */ + xfblob_cookie value_cookie; + + /* XFS_ATTR_* flags */ + int flags; + + /* Length of the value and name. */ + uint32_t valuelen; + uint16_t namelen; +}; + +/* + * Stash up to 8 pages of attrs in xattr_records/xattr_blobs before we write + * them to the temp file. + */ +#define XREP_XATTR_MAX_STASH_BYTES (PAGE_SIZE * 8) + +struct xrep_xattr { + struct xfs_scrub *sc; + + /* Information for exchanging attr fork mappings at the end. */ + struct xrep_tempexch tx; + + /* xattr keys */ + struct xfarray *xattr_records; + + /* xattr values */ + struct xfblob *xattr_blobs; + + /* Number of attributes that we are salvaging. */ + unsigned long long attrs_found; + + /* Can we flush stashed attrs to the tempfile? */ + bool can_flush; + + /* Did the live update fail, and hence the repair is now out of date? */ + bool live_update_aborted; + + /* Lock protecting parent pointer updates */ + struct mutex lock; + + /* Fixed-size array of xrep_xattr_pptr structures. */ + struct xfarray *pptr_recs; + + /* Blobs containing parent pointer names. */ + struct xfblob *pptr_names; + + /* Hook to capture parent pointer updates. */ + struct xfs_dir_hook dhook; + + /* Scratch buffer for capturing parent pointers. 
*/ + struct xfs_da_args pptr_args; + + /* Name buffer */ + struct xfs_name xname; + char namebuf[MAXNAMELEN]; +}; + +/* Create a parent pointer in the tempfile. */ +#define XREP_XATTR_PPTR_ADD (1) + +/* Remove a parent pointer from the tempfile. */ +#define XREP_XATTR_PPTR_REMOVE (2) + +/* A stashed parent pointer update. */ +struct xrep_xattr_pptr { + /* Cookie for retrieval of the pptr name. */ + xfblob_cookie name_cookie; + + /* Parent pointer record. */ + struct xfs_parent_rec pptr_rec; + + /* Length of the pptr name. */ + uint8_t namelen; + + /* XREP_XATTR_PPTR_{ADD,REMOVE} */ + uint8_t action; +}; + +/* Set up to recreate the extended attributes. */ +int +xrep_setup_xattr( + struct xfs_scrub *sc) +{ + if (xfs_has_parent(sc->mp)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); + + return xrep_tempfile_create(sc, S_IFREG); +} + +/* + * Decide if we want to salvage this attribute. We don't bother with + * incomplete or oversized keys or values. The @value parameter can be null + * for remote attrs. + */ +STATIC int +xrep_xattr_want_salvage( + struct xrep_xattr *rx, + unsigned int attr_flags, + const void *name, + int namelen, + const void *value, + int valuelen) +{ + if (attr_flags & XFS_ATTR_INCOMPLETE) + return false; + if (namelen > XATTR_NAME_MAX || namelen <= 0) + return false; + if (!xfs_attr_namecheck(attr_flags, name, namelen)) + return false; + if (valuelen > XATTR_SIZE_MAX || valuelen < 0) + return false; + if (attr_flags & XFS_ATTR_PARENT) + return xfs_parent_valuecheck(rx->sc->mp, value, valuelen); + + return true; +} + +/* Allocate an in-core record to hold xattrs while we rebuild the xattr data. 
*/ +STATIC int +xrep_xattr_salvage_key( + struct xrep_xattr *rx, + int flags, + unsigned char *name, + int namelen, + unsigned char *value, + int valuelen) +{ + struct xrep_xattr_key key = { + .valuelen = valuelen, + .flags = flags & XFS_ATTR_NSP_ONDISK_MASK, + }; + unsigned int i = 0; + int error = 0; + + if (xchk_should_terminate(rx->sc, &error)) + return error; + + /* + * Truncate the name to the first character that would trip namecheck. + * If we no longer have a name after that, ignore this attribute. + */ + if (flags & XFS_ATTR_PARENT) { + key.namelen = namelen; + + trace_xrep_xattr_salvage_pptr(rx->sc->ip, flags, name, + key.namelen, value, valuelen); + } else { + while (i < namelen && name[i] != 0) + i++; + if (i == 0) + return 0; + key.namelen = i; + + trace_xrep_xattr_salvage_rec(rx->sc->ip, flags, name, + key.namelen, valuelen); + } + + error = xfblob_store(rx->xattr_blobs, &key.name_cookie, name, + key.namelen); + if (error) + return error; + + error = xfblob_store(rx->xattr_blobs, &key.value_cookie, value, + key.valuelen); + if (error) + return error; + + error = xfarray_append(rx->xattr_records, &key); + if (error) + return error; + + rx->attrs_found++; + return 0; +} + +/* + * Record a shortform extended attribute key & value for later reinsertion + * into the inode. 
+ */ +STATIC int +xrep_xattr_salvage_sf_attr( + struct xrep_xattr *rx, + struct xfs_attr_sf_hdr *hdr, + struct xfs_attr_sf_entry *sfe) +{ + struct xfs_scrub *sc = rx->sc; + struct xchk_xattr_buf *ab = sc->buf; + unsigned char *name = sfe->nameval; + unsigned char *value = &sfe->nameval[sfe->namelen]; + + if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)name - (char *)hdr, + sfe->namelen)) + return 0; + + if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)value - (char *)hdr, + sfe->valuelen)) + return 0; + + if (!xrep_xattr_want_salvage(rx, sfe->flags, sfe->nameval, + sfe->namelen, value, sfe->valuelen)) + return 0; + + return xrep_xattr_salvage_key(rx, sfe->flags, sfe->nameval, + sfe->namelen, value, sfe->valuelen); +} + +/* + * Record a local format extended attribute key & value for later reinsertion + * into the inode. + */ +STATIC int +xrep_xattr_salvage_local_attr( + struct xrep_xattr *rx, + struct xfs_attr_leaf_entry *ent, + unsigned int nameidx, + const char *buf_end, + struct xfs_attr_leaf_name_local *lentry) +{ + struct xchk_xattr_buf *ab = rx->sc->buf; + unsigned char *value; + unsigned int valuelen; + unsigned int namesize; + + /* + * Decode the leaf local entry format. If something seems wrong, we + * junk the attribute. + */ + value = &lentry->nameval[lentry->namelen]; + valuelen = be16_to_cpu(lentry->valuelen); + namesize = xfs_attr_leaf_entsize_local(lentry->namelen, valuelen); + if ((char *)lentry + namesize > buf_end) + return 0; + if (!xrep_xattr_want_salvage(rx, ent->flags, lentry->nameval, + lentry->namelen, value, valuelen)) + return 0; + if (!xchk_xattr_set_map(rx->sc, ab->usedmap, nameidx, namesize)) + return 0; + + /* Try to save this attribute. */ + return xrep_xattr_salvage_key(rx, ent->flags, lentry->nameval, + lentry->namelen, value, valuelen); +} + +/* + * Record a remote format extended attribute key & value for later reinsertion + * into the inode. 
+ */ +STATIC int +xrep_xattr_salvage_remote_attr( + struct xrep_xattr *rx, + struct xfs_attr_leaf_entry *ent, + unsigned int nameidx, + const char *buf_end, + struct xfs_attr_leaf_name_remote *rentry, + unsigned int ent_idx, + struct xfs_buf *leaf_bp) +{ + struct xchk_xattr_buf *ab = rx->sc->buf; + struct xfs_da_args args = { + .trans = rx->sc->tp, + .dp = rx->sc->ip, + .index = ent_idx, + .geo = rx->sc->mp->m_attr_geo, + .owner = rx->sc->ip->i_ino, + .attr_filter = ent->flags & XFS_ATTR_NSP_ONDISK_MASK, + .namelen = rentry->namelen, + .name = rentry->name, + .value = ab->value, + .valuelen = be32_to_cpu(rentry->valuelen), + }; + unsigned int namesize; + int error; + + /* + * Decode the leaf remote entry format. If something seems wrong, we + * junk the attribute. Note that we should never find a zero-length + * remote attribute value. + */ + namesize = xfs_attr_leaf_entsize_remote(rentry->namelen); + if ((char *)rentry + namesize > buf_end) + return 0; + if (args.valuelen == 0 || + !xrep_xattr_want_salvage(rx, ent->flags, rentry->name, + rentry->namelen, NULL, args.valuelen)) + return 0; + if (!xchk_xattr_set_map(rx->sc, ab->usedmap, nameidx, namesize)) + return 0; + + /* + * Enlarge the buffer (if needed) to hold the value that we're trying + * to salvage from the old extended attribute data. + */ + error = xchk_setup_xattr_buf(rx->sc, args.valuelen); + if (error == -ENOMEM) + error = -EDEADLOCK; + if (error) + return error; + + /* Look up the remote value and stash it for reconstruction. */ + error = xfs_attr3_leaf_getvalue(leaf_bp, &args); + if (error || args.rmtblkno == 0) + goto err_free; + + error = xfs_attr_rmtval_get(&args); + if (error) + goto err_free; + + /* Try to save this attribute. 
*/ + error = xrep_xattr_salvage_key(rx, ent->flags, rentry->name, + rentry->namelen, ab->value, args.valuelen); +err_free: + /* remote value was garbage, junk it */ + if (error == -EFSBADCRC || error == -EFSCORRUPTED) + error = 0; + return error; +} + +/* Extract every xattr key that we can from this attr fork block. */ +STATIC int +xrep_xattr_recover_leaf( + struct xrep_xattr *rx, + struct xfs_buf *bp) +{ + struct xfs_attr3_icleaf_hdr leafhdr; + struct xfs_scrub *sc = rx->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_name_local *lentry; + struct xfs_attr_leaf_name_remote *rentry; + struct xfs_attr_leaf_entry *ent; + struct xfs_attr_leaf_entry *entries; + struct xchk_xattr_buf *ab = rx->sc->buf; + char *buf_end; + size_t off; + unsigned int nameidx; + unsigned int hdrsize; + int i; + int error = 0; + + bitmap_zero(ab->usedmap, mp->m_attr_geo->blksize); + + /* Check the leaf header */ + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); + hdrsize = xfs_attr3_leaf_hdr_size(leaf); + xchk_xattr_set_map(sc, ab->usedmap, 0, hdrsize); + entries = xfs_attr3_leaf_entryp(leaf); + + buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize; + for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) { + if (xchk_should_terminate(sc, &error)) + return error; + + /* Skip key if it conflicts with something else? */ + off = (char *)ent - (char *)leaf; + if (!xchk_xattr_set_map(sc, ab->usedmap, off, + sizeof(xfs_attr_leaf_entry_t))) + continue; + + /* Check the name information. 
*/ + nameidx = be16_to_cpu(ent->nameidx); + if (nameidx < leafhdr.firstused || + nameidx >= mp->m_attr_geo->blksize) + continue; + + if (ent->flags & XFS_ATTR_LOCAL) { + lentry = xfs_attr3_leaf_name_local(leaf, i); + error = xrep_xattr_salvage_local_attr(rx, ent, nameidx, + buf_end, lentry); + } else { + rentry = xfs_attr3_leaf_name_remote(leaf, i); + error = xrep_xattr_salvage_remote_attr(rx, ent, nameidx, + buf_end, rentry, i, bp); + } + if (error) + return error; + } + + return 0; +} + +/* Try to recover shortform attrs. */ +STATIC int +xrep_xattr_recover_sf( + struct xrep_xattr *rx) +{ + struct xfs_scrub *sc = rx->sc; + struct xchk_xattr_buf *ab = sc->buf; + struct xfs_attr_sf_hdr *hdr; + struct xfs_attr_sf_entry *sfe; + struct xfs_attr_sf_entry *next; + struct xfs_ifork *ifp; + unsigned char *end; + int i; + int error = 0; + + ifp = xfs_ifork_ptr(rx->sc->ip, XFS_ATTR_FORK); + hdr = ifp->if_data; + + bitmap_zero(ab->usedmap, ifp->if_bytes); + end = (unsigned char *)ifp->if_data + ifp->if_bytes; + xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(*hdr)); + + sfe = xfs_attr_sf_firstentry(hdr); + if ((unsigned char *)sfe > end) + return 0; + + for (i = 0; i < hdr->count; i++) { + if (xchk_should_terminate(sc, &error)) + return error; + + next = xfs_attr_sf_nextentry(sfe); + if ((unsigned char *)next > end) + break; + + if (xchk_xattr_set_map(sc, ab->usedmap, + (char *)sfe - (char *)hdr, + sizeof(struct xfs_attr_sf_entry))) { + /* + * No conflicts with the sf entry; let's save this + * attribute. + */ + error = xrep_xattr_salvage_sf_attr(rx, hdr, sfe); + if (error) + return error; + } + + sfe = next; + } + + return 0; +} + +/* + * Try to return a buffer of xattr data for a given physical extent. 
+ * + * Because the buffer cache get function complains if it finds a buffer + * matching the block number but not matching the length, we must be careful to + * look for incore buffers (up to the maximum length of a remote value) that + * could be hiding anywhere in the physical range. If we find an incore + * buffer, we can pass that to the caller. Optionally, read a single block and + * pass that back. + * + * Note the subtlety that remote attr value blocks for which there is no incore + * buffer will be passed to the callback one block at a time. These buffers + * will not have any ops attached and must be staled to prevent aliasing with + * multiblock buffers once we drop the ILOCK. + */ +STATIC int +xrep_xattr_find_buf( + struct xfs_mount *mp, + xfs_fsblock_t fsbno, + xfs_extlen_t max_len, + bool can_read, + struct xfs_buf **bpp) +{ + struct xrep_bufscan scan = { + .daddr = XFS_FSB_TO_DADDR(mp, fsbno), + .max_sectors = xrep_bufscan_max_sectors(mp, max_len), + .daddr_step = XFS_FSB_TO_BB(mp, 1), + }; + struct xfs_buf *bp; + + while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { + *bpp = bp; + return 0; + } + + if (!can_read) { + *bpp = NULL; + return 0; + } + + return xfs_buf_read(mp->m_ddev_targp, scan.daddr, XFS_FSB_TO_BB(mp, 1), + XBF_TRYLOCK, bpp, NULL); +} + +/* + * Deal with a buffer that we found during our walk of the attr fork. + * + * Attribute leaf and node blocks are simple -- they're a single block, so we + * can walk them one at a time and we never have to worry about discontiguous + * multiblock buffers like we do for directories. + * + * Unfortunately, remote attr blocks add a lot of complexity here. Each disk + * block is totally self contained, in the sense that the v5 header provides no + * indication that there could be more data in the next block. The incore + * buffers can span multiple blocks, though they never cross extent records. + * However, they don't necessarily start or end on an extent record boundary. 
+ * Therefore, we need a special buffer find function to walk the buffer cache + * for us. + * + * The caller must hold the ILOCK on the file being repaired. We use + * XBF_TRYLOCK here to skip any locked buffer on the assumption that we don't + * own the block and don't want to hang the system on a potentially garbage + * buffer. + */ +STATIC int +xrep_xattr_recover_block( + struct xrep_xattr *rx, + xfs_dablk_t dabno, + xfs_fsblock_t fsbno, + xfs_extlen_t max_len, + xfs_extlen_t *actual_len) +{ + struct xfs_da_blkinfo *info; + struct xfs_buf *bp; + int error; + + error = xrep_xattr_find_buf(rx->sc->mp, fsbno, max_len, true, &bp); + if (error) + return error; + info = bp->b_addr; + *actual_len = XFS_BB_TO_FSB(rx->sc->mp, bp->b_length); + + trace_xrep_xattr_recover_leafblock(rx->sc->ip, dabno, + be16_to_cpu(info->magic)); + + /* + * If the buffer has the right magic number for an attr leaf block and + * passes a structure check (we don't care about checksums), salvage + * as much as we can from the block. */ + if (info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) && + xrep_buf_verify_struct(bp, &xfs_attr3_leaf_buf_ops) && + xfs_attr3_leaf_header_check(bp, rx->sc->ip->i_ino) == NULL) + error = xrep_xattr_recover_leaf(rx, bp); + + /* + * If the buffer didn't already have buffer ops set, it was read in by + * the _find_buf function and could very well be /part/ of a multiblock + * remote block. Mark it stale so that it doesn't hang around in + * memory to cause problems. + */ + if (bp->b_ops == NULL) + xfs_buf_stale(bp); + + xfs_buf_relse(bp); + return error; +} + +/* Insert one xattr key/value. 
*/ +STATIC int +xrep_xattr_insert_rec( + struct xrep_xattr *rx, + const struct xrep_xattr_key *key) +{ + struct xfs_da_args args = { + .dp = rx->sc->tempip, + .attr_filter = key->flags, + .namelen = key->namelen, + .valuelen = key->valuelen, + .owner = rx->sc->ip->i_ino, + .geo = rx->sc->mp->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .op_flags = XFS_DA_OP_OKNOENT, + }; + struct xchk_xattr_buf *ab = rx->sc->buf; + int error; + + /* + * Grab pointers to the scrub buffer so that we can use them to insert + * attrs into the temp file. + */ + args.name = ab->name; + args.value = ab->value; + + /* + * The attribute name is stored near the end of the in-core buffer, + * though we reserve one more byte to ensure null termination. + */ + ab->name[XATTR_NAME_MAX] = 0; + + error = xfblob_load(rx->xattr_blobs, key->name_cookie, ab->name, + key->namelen); + if (error) + return error; + + error = xfblob_free(rx->xattr_blobs, key->name_cookie); + if (error) + return error; + + error = xfblob_load(rx->xattr_blobs, key->value_cookie, args.value, + key->valuelen); + if (error) + return error; + + error = xfblob_free(rx->xattr_blobs, key->value_cookie); + if (error) + return error; + + ab->name[key->namelen] = 0; + + if (key->flags & XFS_ATTR_PARENT) { + trace_xrep_xattr_insert_pptr(rx->sc->tempip, key->flags, + ab->name, key->namelen, ab->value, + key->valuelen); + args.op_flags |= XFS_DA_OP_LOGGED; + } else { + trace_xrep_xattr_insert_rec(rx->sc->tempip, key->flags, + ab->name, key->namelen, key->valuelen); + } + + /* + * xfs_attr_set creates and commits its own transaction. If the attr + * already exists, we'll just drop it during the rebuild. + */ + xfs_attr_sethash(&args); + error = xfs_attr_set(&args, XFS_ATTRUPDATE_CREATE, false); + if (error == -EEXIST) + error = 0; + + return error; +} + +/* + * Periodically flush salvaged attributes to the temporary file. 
This is done + * to reduce the memory requirements of the xattr rebuild because files can + * contain millions of attributes. + */ +STATIC int +xrep_xattr_flush_stashed( + struct xrep_xattr *rx) +{ + xfarray_idx_t array_cur; + int error; + + /* + * Entering this function, the scrub context has a reference to the + * inode being repaired, the temporary file, and a scrub transaction + * that we use during xattr salvaging to avoid livelocking if there + * are cycles in the xattr structures. We hold ILOCK_EXCL on + * the inode being repaired, though it is not ijoined to the scrub + * transaction. + * + * To constrain kernel memory use, we occasionally flush salvaged + * xattrs from the xfarray and xfblob structures into the temporary + * file in preparation for exchanging the xattr structures at the end. + * Updating the temporary file requires a transaction, so we commit the + * scrub transaction and drop the ILOCK so that xfs_attr_set can + * allocate whatever transaction it wants. + * + * We still hold IOLOCK_EXCL on the inode being repaired, which + * prevents anyone from modifying the damaged xattr data while we + * repair it. + */ + error = xrep_trans_commit(rx->sc); + if (error) + return error; + xchk_iunlock(rx->sc, XFS_ILOCK_EXCL); + + /* + * Take the IOLOCK of the temporary file while we modify xattrs. This + * isn't strictly required because the temporary file is never revealed + * to userspace, but we follow the same locking rules. We still hold + * sc->ip's IOLOCK. + */ + error = xrep_tempfile_iolock_polled(rx->sc); + if (error) + return error; + + /* Add all the salvaged attrs to the temporary file. */ + foreach_xfarray_idx(rx->xattr_records, array_cur) { + struct xrep_xattr_key key; + + error = xfarray_load(rx->xattr_records, array_cur, &key); + if (error) + return error; + + error = xrep_xattr_insert_rec(rx, &key); + if (error) + return error; + } + + /* Empty out both arrays now that we've added the entries. 
*/ + xfarray_truncate(rx->xattr_records); + xfblob_truncate(rx->xattr_blobs); + + xrep_tempfile_iounlock(rx->sc); + + /* Recreate the salvage transaction and relock the inode. */ + error = xchk_trans_alloc(rx->sc, 0); + if (error) + return error; + xchk_ilock(rx->sc, XFS_ILOCK_EXCL); + return 0; +} + +/* Decide if we've stashed too much xattr data in memory. */ +static inline bool +xrep_xattr_want_flush_stashed( + struct xrep_xattr *rx) +{ + unsigned long long bytes; + + if (!rx->can_flush) + return false; + + bytes = xfarray_bytes(rx->xattr_records) + + xfblob_bytes(rx->xattr_blobs); + return bytes > XREP_XATTR_MAX_STASH_BYTES; +} + +/* + * Did we observe rename changing parent pointer xattrs while we were flushing + * salvaged attrs? + */ +static inline bool +xrep_xattr_saw_pptr_conflict( + struct xrep_xattr *rx) +{ + bool ret; + + ASSERT(rx->can_flush); + + if (!xfs_has_parent(rx->sc->mp)) + return false; + + xfs_assert_ilocked(rx->sc->ip, XFS_ILOCK_EXCL); + + mutex_lock(&rx->lock); + ret = xfarray_bytes(rx->pptr_recs) > 0; + mutex_unlock(&rx->lock); + + return ret; +} + +/* + * Reset the entire repair state back to initial conditions, now that we've + * detected a parent pointer update to the attr structure while we were + * flushing salvaged attrs. See the locking notes in dir_repair.c for more + * information on why this is all necessary. + */ +STATIC int +xrep_xattr_full_reset( + struct xrep_xattr *rx) +{ + struct xfs_scrub *sc = rx->sc; + struct xfs_attr_sf_hdr *hdr; + struct xfs_ifork *ifp = &sc->tempip->i_af; + int error; + + trace_xrep_xattr_full_reset(sc->ip, sc->tempip); + + /* The temporary file's data fork had better not be in btree format. */ + if (sc->tempip->i_df.if_format == XFS_DINODE_FMT_BTREE) { + ASSERT(0); + return -EIO; + } + + /* + * We begin in transaction context with sc->ip ILOCKed but not joined + * to the transaction. 
To reset to the initial state, we must hold + * sc->ip's ILOCK to prevent rename from updating parent pointer + * information and the tempfile's ILOCK to clear its contents. + */ + xchk_iunlock(rx->sc, XFS_ILOCK_EXCL); + xrep_tempfile_ilock_both(sc); + xfs_trans_ijoin(sc->tp, sc->ip, 0); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + + /* + * Free all the blocks of the attr fork of the temp file, and reset + * it back to local format. + */ + if (xfs_ifork_has_extents(&sc->tempip->i_af)) { + error = xrep_reap_ifork(sc, sc->tempip, XFS_ATTR_FORK); + if (error) + return error; + + ASSERT(ifp->if_bytes == 0); + ifp->if_format = XFS_DINODE_FMT_LOCAL; + xfs_idata_realloc(sc->tempip, sizeof(*hdr), XFS_ATTR_FORK); + } + + /* Reinitialize the attr fork to an empty shortform structure. */ + hdr = ifp->if_data; + memset(hdr, 0, sizeof(*hdr)); + hdr->totsize = cpu_to_be16(sizeof(*hdr)); + xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE | XFS_ILOG_ADATA); + + /* + * Roll this transaction to commit our reset ondisk. The tempfile + * should no longer be joined to the transaction, so we drop its ILOCK. + * This should leave us in transaction context with sc->ip ILOCKed but + * not joined to the transaction. + */ + error = xrep_roll_trans(sc); + if (error) + return error; + xrep_tempfile_iunlock(sc); + + /* + * Erase any accumulated parent pointer updates now that we've erased + * the tempfile's attr fork. We're resetting the entire repair state + * back to where we were initially, except now we won't flush salvaged + * xattrs until the very end. + */ + mutex_lock(&rx->lock); + xfarray_truncate(rx->pptr_recs); + xfblob_truncate(rx->pptr_names); + mutex_unlock(&rx->lock); + + rx->can_flush = false; + rx->attrs_found = 0; + + ASSERT(xfarray_bytes(rx->xattr_records) == 0); + ASSERT(xfblob_bytes(rx->xattr_blobs) == 0); + return 0; +} + +/* Extract as many attribute keys and values as we can. 
*/
+STATIC int
+xrep_xattr_recover(
+	struct xrep_xattr *rx)
+{
+	struct xfs_bmbt_irec got;
+	struct xfs_scrub *sc = rx->sc;
+	struct xfs_da_geometry *geo = sc->mp->m_attr_geo;
+	xfs_fileoff_t offset;
+	xfs_extlen_t len;
+	xfs_dablk_t dabno;
+	int nmap;
+	int error;
+
+restart:
+	/*
+	 * Iterate each xattr leaf block in the attr fork to scan them for any
+	 * attributes that we might salvage.
+	 *
+	 * Control jumps back to the restart label after a full reset, so the
+	 * whole attr fork is then rescanned from file offset zero.
+	 *
+	 * Returns 0 once the whole fork has been walked, -EFSCORRUPTED if
+	 * xfs_bmapi_read does not hand back exactly one mapping, or any
+	 * negative errno passed up from the helpers called below.
+	 */
+	for (offset = 0;
+	     offset < XFS_MAX_FILEOFF;
+	     offset = got.br_startoff + got.br_blockcount) {
+		nmap = 1;
+		error = xfs_bmapi_read(sc->ip, offset, XFS_MAX_FILEOFF - offset,
+				&got, &nmap, XFS_BMAPI_ATTRFORK);
+		if (error)
+			return error;
+		if (nmap != 1)
+			return -EFSCORRUPTED;
+		if (!xfs_bmap_is_written_extent(&got))
+			continue;	/* only written extents are scanned */
+
+		for (dabno = round_up(got.br_startoff, geo->fsbcount);
+		     dabno < got.br_startoff + got.br_blockcount;
+		     dabno += len) {	/* len is set by xrep_xattr_recover_block */
+			xfs_fileoff_t curr_offset = dabno - got.br_startoff;
+			xfs_extlen_t maxlen;
+
+			if (xchk_should_terminate(rx->sc, &error))
+				return error;
+
+			maxlen = min_t(xfs_filblks_t, INT_MAX,
+					got.br_blockcount - curr_offset);
+			error = xrep_xattr_recover_block(rx, dabno,
+					curr_offset + got.br_startblock,
+					maxlen, &len);
+			if (error)
+				return error;
+
+			if (xrep_xattr_want_flush_stashed(rx)) {
+				error = xrep_xattr_flush_stashed(rx);
+				if (error)
+					return error;
+
+				if (xrep_xattr_saw_pptr_conflict(rx)) {
+					error = xrep_xattr_full_reset(rx);
+					if (error)
+						return error;
+
+					goto restart;
+				}
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Reset the extended attribute fork to a state where we can start re-adding
+ * the salvaged attributes.
+ *
+ * Returns 0 on success. If the data fork is not in btree format and the attr
+ * fork still has extents, the extents are logged and -EFSCORRUPTED is
+ * returned.
+ */
+STATIC int
+xrep_xattr_fork_remove(
+	struct xfs_scrub *sc,
+	struct xfs_inode *ip)
+{
+	struct xfs_attr_sf_hdr *hdr;
+	struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK);
+
+	/*
+	 * If the data fork is in btree format, we can't change di_forkoff
+	 * because we could run afoul of the rule that the data fork isn't
+	 * supposed to be in btree format if there's enough space in the fork
+	 * that it could have used extents format. Instead, reinitialize the
+	 * attr fork to have a shortform structure with zero attributes.
+	 */
+	if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE) {
+		ifp->if_format = XFS_DINODE_FMT_LOCAL;
+		hdr = xfs_idata_realloc(ip, (int)sizeof(*hdr) - ifp->if_bytes,
+				XFS_ATTR_FORK);
+		hdr->count = 0;
+		hdr->totsize = cpu_to_be16(sizeof(*hdr));
+		xfs_trans_log_inode(sc->tp, ip,
+				XFS_ILOG_CORE | XFS_ILOG_ADATA);
+		return 0;
+	}
+
+	/* If we still have attr fork extents, something's wrong. */
+	if (ifp->if_nextents != 0) {
+		struct xfs_iext_cursor icur;
+		struct xfs_bmbt_irec irec;
+		unsigned int i = 0;
+
+		xfs_emerg(sc->mp,
+	"inode 0x%llx attr fork still has %llu attr extents, format %d?!",
+				ip->i_ino, ifp->if_nextents, ifp->if_format);
+		for_each_xfs_iext(ifp, &icur, &irec) {
+			xfs_err(sc->mp,
+	"[%u]: startoff %llu startblock %llu blockcount %llu state %u",
+					i++, irec.br_startoff,
+					irec.br_startblock, irec.br_blockcount,
+					irec.br_state);
+		}
+		ASSERT(0);
+		return -EFSCORRUPTED;
+	}
+
+	xfs_attr_fork_remove(ip, sc->tp);
+	return 0;
+}
+
+/*
+ * Free all the attribute fork blocks of the file being repaired and delete the
+ * fork. The caller must ILOCK the scrub file and join it to the transaction.
+ * This function returns with the inode joined to a clean transaction.
+ *
+ * Returns 0 or a negative errno.
+ */
+int
+xrep_xattr_reset_fork(
+	struct xfs_scrub *sc)
+{
+	int error;
+
+	trace_xrep_xattr_reset_fork(sc->ip, sc->ip);
+
+	/* Unmap all the attr blocks. */
+	if (xfs_ifork_has_extents(&sc->ip->i_af)) {
+		error = xrep_reap_ifork(sc, sc->ip, XFS_ATTR_FORK);
+		if (error)
+			return error;
+	}
+
+	error = xrep_xattr_fork_remove(sc, sc->ip);
+	if (error)
+		return error;
+
+	return xfs_trans_roll_inode(&sc->tp, sc->ip);
+}
+
+/*
+ * Free all the attribute fork blocks of the temporary file and delete the attr
+ * fork. The caller must ILOCK the tempfile and join it to the transaction.
+ * This function returns with the inode joined to a clean scrub transaction.
+ *
+ * Returns 0 or a negative errno.
+ * NOTE(review): unlike xrep_xattr_reset_fork, no transaction roll is issued
+ * in this function itself -- confirm that callers roll afterwards.
+ */
+int
+xrep_xattr_reset_tempfile_fork(
+	struct xfs_scrub *sc)
+{
+	int error;
+
+	trace_xrep_xattr_reset_fork(sc->ip, sc->tempip);
+
+	/*
+	 * Wipe out the attr fork of the temp file so that regular inode
+	 * inactivation won't trip over the corrupt attr fork.
+	 */
+	if (xfs_ifork_has_extents(&sc->tempip->i_af)) {
+		error = xrep_reap_ifork(sc, sc->tempip, XFS_ATTR_FORK);
+		if (error)
+			return error;
+	}
+
+	return xrep_xattr_fork_remove(sc, sc->tempip);
+}
+
+/*
+ * Find all the extended attributes for this inode by scraping them out of the
+ * attribute key blocks by hand, and flushing them into the temp file.
+ * When we're done, free the staging memory before exchanging the xattr
+ * structures to reduce memory usage.
+ *
+ * Both the shortform path and the block-scanning path finish with a call to
+ * xrep_xattr_flush_stashed. Returns 0 or a negative errno.
+ */
+STATIC int
+xrep_xattr_salvage_attributes(
+	struct xrep_xattr *rx)
+{
+	struct xfs_inode *ip = rx->sc->ip;
+	int error;
+
+	/* Short format xattrs are easy! */
+	if (rx->sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL) {
+		error = xrep_xattr_recover_sf(rx);
+		if (error)
+			return error;
+
+		return xrep_xattr_flush_stashed(rx);
+	}
+
+	/*
+	 * For non-inline xattr structures, the salvage function scans the
+	 * buffer cache looking for potential attr leaf blocks. The scan
+	 * requires the ability to lock any buffer found and runs independently
+	 * of any transaction <-> buffer item <-> buffer linkage. Therefore,
+	 * roll the transaction to ensure there are no buffers joined. We hold
+	 * the ILOCK independently of the transaction.
+	 */
+	error = xfs_trans_roll(&rx->sc->tp);
+	if (error)
+		return error;
+
+	error = xfs_iread_extents(rx->sc->tp, ip, XFS_ATTR_FORK);
+	if (error)
+		return error;
+
+	error = xrep_xattr_recover(rx);
+	if (error)
+		return error;
+
+	return xrep_xattr_flush_stashed(rx);
+}
+
+/*
+ * Add this stashed incore parent pointer to the temporary file. The caller
+ * must hold the tempfile's IOLOCK, must not hold any ILOCKs, and must not be in
+ * transaction context.
+ *
+ * Returns the result of xfs_parent_set or xfs_parent_unset, depending on
+ * pptr->action, or -EIO if the action is neither ADD nor REMOVE.
+ */
+STATIC int
+xrep_xattr_replay_pptr_update(
+	struct xrep_xattr *rx,
+	const struct xfs_name *xname,
+	struct xrep_xattr_pptr *pptr)
+{
+	struct xfs_scrub *sc = rx->sc;
+	int error;
+
+	switch (pptr->action) {
+	case XREP_XATTR_PPTR_ADD:
+		/* Create parent pointer. */
+		trace_xrep_xattr_replay_parentadd(sc->tempip, xname,
+				&pptr->pptr_rec);
+
+		error = xfs_parent_set(sc->tempip, sc->ip->i_ino, xname,
+				&pptr->pptr_rec, &rx->pptr_args);
+		ASSERT(error != -EEXIST);
+		return error;
+	case XREP_XATTR_PPTR_REMOVE:
+		/* Remove parent pointer. */
+		trace_xrep_xattr_replay_parentremove(sc->tempip, xname,
+				&pptr->pptr_rec);
+
+		error = xfs_parent_unset(sc->tempip, sc->ip->i_ino, xname,
+				&pptr->pptr_rec, &rx->pptr_args);
+		ASSERT(error != -ENOATTR);
+		return error;
+	}
+
+	ASSERT(0);
+	return -EIO;
+}
+
+/*
+ * Flush stashed parent pointer updates that have been recorded by the scanner.
+ * This is done to reduce the memory requirements of the xattr rebuild, since
+ * files can have a lot of hardlinks and the fs can be busy.
+ *
+ * Caller must not hold transactions or ILOCKs. Caller must hold the tempfile
+ * IOLOCK.
+ *
+ * rx->lock is held while each record and its name are loaded, and is dropped
+ * around the call that replays the update into the tempfile. If a replay
+ * fails, the function returns with rx->lock already released. Returns 0 or a
+ * negative errno.
+ */
+STATIC int
+xrep_xattr_replay_pptr_updates(
+	struct xrep_xattr *rx)
+{
+	xfarray_idx_t array_cur;
+	int error;
+
+	mutex_lock(&rx->lock);
+	foreach_xfarray_idx(rx->pptr_recs, array_cur) {
+		struct xrep_xattr_pptr pptr;
+
+		error = xfarray_load(rx->pptr_recs, array_cur, &pptr);
+		if (error)
+			goto out_unlock;
+
+		error = xfblob_loadname(rx->pptr_names, pptr.name_cookie,
+				&rx->xname, pptr.namelen);
+		if (error)
+			goto out_unlock;
+		mutex_unlock(&rx->lock);
+
+		error = xrep_xattr_replay_pptr_update(rx, &rx->xname, &pptr);
+		if (error)
+			return error;	/* rx->lock is not held at this point */
+
+		mutex_lock(&rx->lock);
+	}
+
+	/* Empty out both arrays now that we've added the entries. */
+	xfarray_truncate(rx->pptr_recs);
+	xfblob_truncate(rx->pptr_names);
+	mutex_unlock(&rx->lock);
+	return 0;
+out_unlock:
+	mutex_unlock(&rx->lock);
+	return error;
+}
+
+/*
+ * Remember that we want to create a parent pointer in the tempfile. These
+ * stashed actions will be replayed later.
+ *
+ * The name is stored in rx->pptr_names and a record carrying its cookie is
+ * appended to rx->pptr_recs. Returns 0 or a negative errno. This function
+ * does no locking of its own; xrep_xattr_live_dirent_update calls it with
+ * rx->lock held.
+ */
+STATIC int
+xrep_xattr_stash_parentadd(
+	struct xrep_xattr *rx,
+	const struct xfs_name *name,
+	const struct xfs_inode *dp)
+{
+	struct xrep_xattr_pptr pptr = {
+		.action = XREP_XATTR_PPTR_ADD,
+		.namelen = name->len,
+	};
+	int error;
+
+	trace_xrep_xattr_stash_parentadd(rx->sc->tempip, dp, name);
+
+	xfs_inode_to_parent_rec(&pptr.pptr_rec, dp);
+	error = xfblob_storename(rx->pptr_names, &pptr.name_cookie, name);
+	if (error)
+		return error;
+
+	return xfarray_append(rx->pptr_recs, &pptr);
+}
+
+/*
+ * Remember that we want to remove a parent pointer from the tempfile. These
+ * stashed actions will be replayed later.
+ *
+ * The name is stored in rx->pptr_names and a record carrying its cookie is
+ * appended to rx->pptr_recs. Returns 0 or a negative errno. This function
+ * does no locking of its own; xrep_xattr_live_dirent_update calls it with
+ * rx->lock held.
+ */
+STATIC int
+xrep_xattr_stash_parentremove(
+	struct xrep_xattr *rx,
+	const struct xfs_name *name,
+	const struct xfs_inode *dp)
+{
+	struct xrep_xattr_pptr pptr = {
+		.action = XREP_XATTR_PPTR_REMOVE,
+		.namelen = name->len,
+	};
+	int error;
+
+	trace_xrep_xattr_stash_parentremove(rx->sc->tempip, dp, name);
+
+	xfs_inode_to_parent_rec(&pptr.pptr_rec, dp);
+	error = xfblob_storename(rx->pptr_names, &pptr.name_cookie, name);
+	if (error)
+		return error;
+
+	return xfarray_append(rx->pptr_recs, &pptr);
+}
+
+/*
+ * Capture dirent updates being made by other threads. We will have to replay
+ * the parent pointer updates before exchanging attr forks.
+ *
+ * Always returns NOTIFY_DONE. A failure to stash an update is not returned;
+ * it is recorded by setting rx->live_update_aborted.
+ */
+STATIC int
+xrep_xattr_live_dirent_update(
+	struct notifier_block *nb,
+	unsigned long action,
+	void *data)
+{
+	struct xfs_dir_update_params *p = data;
+	struct xrep_xattr *rx;
+	struct xfs_scrub *sc;
+	int error;
+
+	rx = container_of(nb, struct xrep_xattr, dhook.dirent_hook.nb);
+	sc = rx->sc;
+
+	/*
+	 * This thread updated a dirent that points to the file that we're
+	 * repairing, so stash the update for replay against the temporary
+	 * file.
+	 */
+	if (p->ip->i_ino != sc->ip->i_ino)
+		return NOTIFY_DONE;	/* update is for some other file */
+
+	mutex_lock(&rx->lock);
+	if (p->delta > 0)
+		error = xrep_xattr_stash_parentadd(rx, p->name, p->dp);
+	else
+		error = xrep_xattr_stash_parentremove(rx, p->name, p->dp);
+	if (error)
+		rx->live_update_aborted = true;
+	mutex_unlock(&rx->lock);
+	return NOTIFY_DONE;
+}
+
+/*
+ * Prepare both inodes' attribute forks for an exchange. Promote the tempfile
+ * from short format to leaf format, and if the file being repaired has a short
+ * format attr fork, turn it into an empty extent list.
+ *
+ * @temp_local and @ip_local say which of the two attr forks are currently in
+ * local format. Returns 0 or a negative errno.
+ */
+STATIC int
+xrep_xattr_swap_prep(
+	struct xfs_scrub *sc,
+	bool temp_local,
+	bool ip_local)
+{
+	int error;
+
+	/*
+	 * If the tempfile's attributes are in shortform format, convert that
+	 * to a single leaf extent so that we can use the atomic mapping
+	 * exchange.
+	 */
+	if (temp_local) {
+		struct xfs_da_args args = {
+			.dp = sc->tempip,
+			.geo = sc->mp->m_attr_geo,
+			.whichfork = XFS_ATTR_FORK,
+			.trans = sc->tp,
+			.total = 1,
+			.owner = sc->ip->i_ino,
+		};
+
+		error = xfs_attr_shortform_to_leaf(&args);
+		if (error)
+			return error;
+
+		/*
+		 * Roll the deferred log items to get us back to a clean
+		 * transaction.
+		 */
+		error = xfs_defer_finish(&sc->tp);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * If the file being repaired had a shortform attribute fork, convert
+	 * that to an empty extent list in preparation for the atomic mapping
+	 * exchange.
+	 */
+	if (ip_local) {
+		struct xfs_ifork *ifp;
+
+		ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
+
+		xfs_idestroy_fork(ifp);
+		ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+		ifp->if_nextents = 0;
+		ifp->if_bytes = 0;
+		ifp->if_data = NULL;
+		ifp->if_height = 0;
+
+		xfs_trans_log_inode(sc->tp, sc->ip,
+				XFS_ILOG_CORE | XFS_ILOG_ADATA);
+	}
+
+	return 0;
+}
+
+/*
+ * Exchange the temporary file's attribute fork with the one being repaired.
+ *
+ * Returns 0 or a negative errno.
+ */
+int
+xrep_xattr_swap(
+	struct xfs_scrub *sc,
+	struct xrep_tempexch *tx)
+{
+	bool ip_local, temp_local;
+	int error = 0;
+
+	ip_local = sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL;
+	temp_local = sc->tempip->i_af.if_format == XFS_DINODE_FMT_LOCAL;
+
+	/*
+	 * If both files have a local format attr fork and the rebuilt
+	 * xattr data would fit in the repaired file's attr fork, just copy
+	 * the contents from the tempfile and declare ourselves done.
+	 */
+	if (ip_local && temp_local) {
+		int forkoff;
+		int newsize;
+
+		newsize = xfs_attr_sf_totsize(sc->tempip);
+		forkoff = xfs_attr_shortform_bytesfit(sc->ip, newsize);
+		if (forkoff > 0) {
+			sc->ip->i_forkoff = forkoff;
+			xrep_tempfile_copyout_local(sc, XFS_ATTR_FORK);
+			return 0;
+		}
+	}
+
+	/* Otherwise, make sure both attr forks are in block-mapping mode. */
+	error = xrep_xattr_swap_prep(sc, temp_local, ip_local);
+	if (error)
+		return error;
+
+	return xrep_tempexch_contents(sc, tx);
+}
+
+/*
+ * Finish replaying stashed parent pointer updates, allocate a transaction for
+ * exchanging extent mappings, and take the ILOCKs of both files before we
+ * commit the new extended attribute structure.
+ *
+ * On a filesystem without parent pointers this only allocates the exchange
+ * transaction. Returns 0 or a negative errno.
+ */
+STATIC int
+xrep_xattr_finalize_tempfile(
+	struct xrep_xattr *rx)
+{
+	struct xfs_scrub *sc = rx->sc;
+	int error;
+
+	if (!xfs_has_parent(sc->mp))
+		return xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rx->tx);
+
+	/*
+	 * Repair relies on the ILOCK to quiesce all possible xattr updates.
+	 * Replay all queued parent pointer updates into the tempfile before
+	 * exchanging the contents, even if that means dropping the ILOCKs and
+	 * the transaction.
+	 */
+	do {
+		error = xrep_xattr_replay_pptr_updates(rx);
+		if (error)
+			return error;
+
+		error = xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rx->tx);
+		if (error)
+			return error;
+
+		if (xfarray_length(rx->pptr_recs) == 0)
+			break;	/* nothing new was queued; keep tp and locks */
+
+		xchk_trans_cancel(sc);
+		xrep_tempfile_iunlock_both(sc);
+	} while (!xchk_should_terminate(sc, &error));
+	return error;
+}
+
+/*
+ * Exchange the new extended attribute data (which we created in the tempfile)
+ * with the file being repaired.
+ *
+ * If no attributes were salvaged, the attr fork of the file is reset instead
+ * and no exchange takes place. Cached ACLs are forgotten on every successful
+ * path. Returns 0 or a negative errno.
+ */
+STATIC int
+xrep_xattr_rebuild_tree(
+	struct xrep_xattr *rx)
+{
+	struct xfs_scrub *sc = rx->sc;
+	int error;
+
+	/*
+	 * If we didn't find any attributes to salvage, repair the file by
+	 * zapping its attr fork.
+	 */
+	if (rx->attrs_found == 0) {
+		xfs_trans_ijoin(sc->tp, sc->ip, 0);
+		error = xrep_xattr_reset_fork(sc);
+		if (error)
+			return error;
+
+		goto forget_acls;
+	}
+
+	trace_xrep_xattr_rebuild_tree(sc->ip, sc->tempip);
+
+	/*
+	 * Commit the repair transaction and drop the ILOCKs so that we can use
+	 * the atomic file content exchange helper functions to compute the
+	 * correct resource reservations.
+	 *
+	 * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent xattr
+	 * modifications, but there's nothing to prevent userspace from reading
+	 * the attributes until we're ready for the exchange operation. Reads
+	 * will return -EIO without shutting down the fs, so we're ok with
+	 * that.
+	 */
+	error = xrep_trans_commit(sc);
+	if (error)
+		return error;
+
+	xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+	/*
+	 * Take the IOLOCK on the temporary file so that we can run xattr
+	 * operations with the same locks held as we would for a normal file.
+	 * We still hold sc->ip's IOLOCK.
+	 */
+	error = xrep_tempfile_iolock_polled(rx->sc);
+	if (error)
+		return error;
+
+	/*
+	 * Allocate transaction, lock inodes, and make sure that we've replayed
+	 * all the stashed parent pointer updates to the temp file. After this
+	 * point, we're ready to exchange attr fork mappings.
+	 */
+	error = xrep_xattr_finalize_tempfile(rx);
+	if (error)
+		return error;
+
+	/*
+	 * Exchange the blocks mapped by the tempfile's attr fork with the file
+	 * being repaired. The old attr blocks will then be attached to the
+	 * tempfile, so reap its attr fork.
+	 */
+	error = xrep_xattr_swap(sc, &rx->tx);
+	if (error)
+		return error;
+
+	error = xrep_xattr_reset_tempfile_fork(sc);
+	if (error)
+		return error;
+
+	/*
+	 * Roll to get a transaction without any inodes joined to it. Then we
+	 * can drop the tempfile's ILOCK and IOLOCK before doing more work on
+	 * the scrub target file.
+	 */
+	error = xfs_trans_roll(&sc->tp);
+	if (error)
+		return error;
+
+	xrep_tempfile_iunlock(sc);
+	xrep_tempfile_iounlock(sc);
+
+forget_acls:
+	/* Invalidate cached ACLs now that we've reloaded all the xattrs. */
+	xfs_forget_acl(VFS_I(sc->ip), SGI_ACL_FILE);
+	xfs_forget_acl(VFS_I(sc->ip), SGI_ACL_DEFAULT);
+	return 0;
+}
+
+/* Tear down all the incore scan stuff we created.
*/
+STATIC void
+xrep_xattr_teardown(
+	struct xrep_xattr *rx)
+{
+	if (xfs_has_parent(rx->sc->mp))
+		xfs_dir_hook_del(rx->sc->mp, &rx->dhook);
+	if (rx->pptr_names)	/* only created when parent pointers are enabled */
+		xfblob_destroy(rx->pptr_names);
+	if (rx->pptr_recs)
+		xfarray_destroy(rx->pptr_recs);
+	xfblob_destroy(rx->xattr_blobs);
+	xfarray_destroy(rx->xattr_records);
+	mutex_destroy(&rx->lock);
+	kfree(rx);	/* rx itself is freed; the caller must not use it again */
+}
+
+/*
+ * Set up the filesystem scan so we can regenerate extended attributes.
+ *
+ * On success, *rxp points to a newly allocated repair context and 0 is
+ * returned. On failure, everything allocated here is released again, *rxp is
+ * left untouched, and a negative errno is returned. -ENOMEM from
+ * xchk_setup_xattr_buf is reported as -EDEADLOCK.
+ * NOTE(review): the reason for that errno conversion is not visible in this
+ * function -- confirm against the scrub retry logic.
+ */
+STATIC int
+xrep_xattr_setup_scan(
+	struct xfs_scrub *sc,
+	struct xrep_xattr **rxp)
+{
+	struct xrep_xattr *rx;
+	char *descr;
+	int max_len;
+	int error;
+
+	rx = kzalloc(sizeof(struct xrep_xattr), XCHK_GFP_FLAGS);
+	if (!rx)
+		return -ENOMEM;
+	rx->sc = sc;
+	rx->can_flush = true;
+	rx->xname.name = rx->namebuf;
+
+	mutex_init(&rx->lock);
+
+	/*
+	 * Allocate enough memory to handle loading local attr values from the
+	 * xfblob data while flushing stashed attrs to the temporary file.
+	 * We only realloc the buffer when salvaging remote attr values.
+	 */
+	max_len = xfs_attr_leaf_entsize_local_max(sc->mp->m_attr_geo->blksize);
+	error = xchk_setup_xattr_buf(rx->sc, max_len);
+	if (error == -ENOMEM)
+		error = -EDEADLOCK;
+	if (error)
+		goto out_rx;
+
+	/* Set up some staging for salvaged attribute keys and values */
+	descr = xchk_xfile_ino_descr(sc, "xattr keys");
+	error = xfarray_create(descr, 0, sizeof(struct xrep_xattr_key),
+			&rx->xattr_records);
+	kfree(descr);
+	if (error)
+		goto out_rx;
+
+	descr = xchk_xfile_ino_descr(sc, "xattr names");
+	error = xfblob_create(descr, &rx->xattr_blobs);
+	kfree(descr);
+	if (error)
+		goto out_keys;
+
+	if (xfs_has_parent(sc->mp)) {
+		ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+
+		descr = xchk_xfile_ino_descr(sc,
+				"xattr retained parent pointer entries");
+		error = xfarray_create(descr, 0,
+				sizeof(struct xrep_xattr_pptr),
+				&rx->pptr_recs);
+		kfree(descr);
+		if (error)
+			goto out_values;
+
+		descr = xchk_xfile_ino_descr(sc,
+				"xattr retained parent pointer names");
+		error = xfblob_create(descr, &rx->pptr_names);
+		kfree(descr);
+		if (error)
+			goto out_pprecs;
+
+		xfs_dir_hook_setup(&rx->dhook, xrep_xattr_live_dirent_update);
+		error = xfs_dir_hook_add(sc->mp, &rx->dhook);
+		if (error)
+			goto out_ppnames;
+	}
+
+	*rxp = rx;
+	return 0;
+out_ppnames:	/* unwind in reverse order of creation */
+	xfblob_destroy(rx->pptr_names);
+out_pprecs:
+	xfarray_destroy(rx->pptr_recs);
+out_values:
+	xfblob_destroy(rx->xattr_blobs);
+out_keys:
+	xfarray_destroy(rx->xattr_records);
+out_rx:
+	mutex_destroy(&rx->lock);
+	kfree(rx);
+	return error;
+}
+
+/*
+ * Repair the extended attribute metadata.
+ *
+ * XXX: Remote attribute value buffers encompass the entire (up to 64k) buffer.
+ * The buffer cache in XFS can't handle aliased multiblock buffers, so this
+ * might misbehave if the attr fork is crosslinked with other filesystem
+ * metadata.
+ *
+ * Returns -ENOENT if xfs_inode_hasattr() is false for the file, -EOPNOTSUPP
+ * if the filesystem lacks rmapbt or exchange-range support, -EIO if a live
+ * dirent update could not be stashed during the salvage, or any other
+ * negative errno from the setup, salvage, or rebuild steps; 0 on success.
+ * The scan context is torn down on every path once it has been set up.
+ */
+int
+xrep_xattr(
+	struct xfs_scrub *sc)
+{
+	struct xrep_xattr *rx = NULL;
+	int error;
+
+	if (!xfs_inode_hasattr(sc->ip))
+		return -ENOENT;
+
+	/* The rmapbt is required to reap the old attr fork. */
+	if (!xfs_has_rmapbt(sc->mp))
+		return -EOPNOTSUPP;
+	/* We require atomic file exchange range to rebuild anything. */
+	if (!xfs_has_exchange_range(sc->mp))
+		return -EOPNOTSUPP;
+
+	error = xrep_xattr_setup_scan(sc, &rx);
+	if (error)
+		return error;
+
+	ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+
+	error = xrep_xattr_salvage_attributes(rx);
+	if (error)
+		goto out_scan;
+
+	if (rx->live_update_aborted) {	/* set by the dirent hook on a stash failure */
+		error = -EIO;
+		goto out_scan;
+	}
+
+	/* Last chance to abort before we start committing fixes. */
+	if (xchk_should_terminate(sc, &error))
+		goto out_scan;
+
+	error = xrep_xattr_rebuild_tree(rx);
+	if (error)
+		goto out_scan;	/* same destination as falling through */
+
+out_scan:
+	xrep_xattr_teardown(rx);
+	return error;
+}
diff --git a/fs/xfs/scrub/attr_repair.h b/fs/xfs/scrub/attr_repair.h
new file mode 100644
index 000000000000..979729bd4a5f
--- /dev/null
+++ b/fs/xfs/scrub/attr_repair.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_ATTR_REPAIR_H__
+#define __XFS_SCRUB_ATTR_REPAIR_H__
+
+struct xrep_tempexch;
+
+int xrep_xattr_swap(struct xfs_scrub *sc, struct xrep_tempexch *tx);
+int xrep_xattr_reset_fork(struct xfs_scrub *sc);
+int xrep_xattr_reset_tempfile_fork(struct xfs_scrub *sc);
+
+#endif /* __XFS_SCRUB_ATTR_REPAIR_H__ */
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index 0cb8d43912a8..7ba35a7a7920 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -40,22 +40,23 @@ struct xbitmap64_node {
 * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll
 * forward-declare them anyway for clarity.
*/ -static inline void +static inline __maybe_unused void xbitmap64_tree_insert(struct xbitmap64_node *node, struct rb_root_cached *root); -static inline void +static inline __maybe_unused void xbitmap64_tree_remove(struct xbitmap64_node *node, struct rb_root_cached *root); -static inline struct xbitmap64_node * +static inline __maybe_unused struct xbitmap64_node * xbitmap64_tree_iter_first(struct rb_root_cached *root, uint64_t start, uint64_t last); -static inline struct xbitmap64_node * +static inline __maybe_unused struct xbitmap64_node * xbitmap64_tree_iter_next(struct xbitmap64_node *node, uint64_t start, uint64_t last); INTERVAL_TREE_DEFINE(struct xbitmap64_node, bn_rbnode, uint64_t, - __bn_subtree_last, START, LAST, static inline, xbitmap64_tree) + __bn_subtree_last, START, LAST, static inline __maybe_unused, + xbitmap64_tree) /* Iterate each interval of a bitmap. Do not change the bitmap. */ #define for_each_xbitmap64_extent(bn, bitmap) \ @@ -314,22 +315,23 @@ struct xbitmap32_node { * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll * forward-declare them anyway for clarity. 
*/ -static inline void +static inline __maybe_unused void xbitmap32_tree_insert(struct xbitmap32_node *node, struct rb_root_cached *root); -static inline void +static inline __maybe_unused void xbitmap32_tree_remove(struct xbitmap32_node *node, struct rb_root_cached *root); -static inline struct xbitmap32_node * +static inline __maybe_unused struct xbitmap32_node * xbitmap32_tree_iter_first(struct rb_root_cached *root, uint32_t start, uint32_t last); -static inline struct xbitmap32_node * +static inline __maybe_unused struct xbitmap32_node * xbitmap32_tree_iter_next(struct xbitmap32_node *node, uint32_t start, uint32_t last); INTERVAL_TREE_DEFINE(struct xbitmap32_node, bn_rbnode, uint32_t, - __bn_subtree_last, START, LAST, static inline, xbitmap32_tree) + __bn_subtree_last, START, LAST, static inline __maybe_unused, + xbitmap32_tree) /* Iterate each interval of a bitmap. Do not change the bitmap. */ #define for_each_xbitmap32_extent(bn, bitmap) \ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 47a20cf5205f..1ad8ec63a7f4 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -31,6 +31,8 @@ #include "xfs_ag.h" #include "xfs_error.h" #include "xfs_quota.h" +#include "xfs_exchmaps.h" +#include "xfs_rtbitmap.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -445,7 +447,7 @@ xchk_perag_read_headers( { int error; - error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp); + error = xfs_ialloc_read_agi(sa->pag, sc->tp, 0, &sa->agi_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) return error; @@ -781,7 +783,7 @@ xchk_iget( { ASSERT(sc->tp != NULL); - return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp); + return xfs_iget(sc->mp, sc->tp, inum, XCHK_IGET_FLAGS, 0, ipp); } /* @@ -827,13 +829,13 @@ again: * in the iget cache miss path. 
*/ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); - error = xfs_ialloc_read_agi(pag, tp, agi_bpp); + error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp); xfs_perag_put(pag); if (error) return error; - error = xfs_iget(mp, tp, inum, - XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp); + error = xfs_iget(mp, tp, inum, XFS_IGET_NORETRY | XCHK_IGET_FLAGS, 0, + ipp); if (error == -EAGAIN) { /* * The inode may be in core but temporarily unavailable and may @@ -1060,12 +1062,6 @@ xchk_irele( spin_lock(&VFS_I(ip)->i_lock); VFS_I(ip)->i_state &= ~I_DONTCACHE; spin_unlock(&VFS_I(ip)->i_lock); - } else if (atomic_read(&VFS_I(ip)->i_count) == 1) { - /* - * If this is the last reference to the inode and the caller - * permits it, set DONTCACHE to avoid thrashing. - */ - d_mark_dontcache(VFS_I(ip)); } xfs_irele(ip); @@ -1202,27 +1198,12 @@ xchk_metadata_inode_subtype( struct xfs_scrub *sc, unsigned int scrub_type) { - __u32 smtype = sc->sm->sm_type; - unsigned int sick_mask = sc->sick_mask; + struct xfs_scrub_subord *sub; int error; - sc->sm->sm_type = scrub_type; - - switch (scrub_type) { - case XFS_SCRUB_TYPE_INODE: - error = xchk_inode(sc); - break; - case XFS_SCRUB_TYPE_BMBTD: - error = xchk_bmap_data(sc); - break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - break; - } - - sc->sick_mask = sick_mask; - sc->sm->sm_type = smtype; + sub = xchk_scrub_create_subord(sc, scrub_type); + error = sub->sc.ops->scrub(&sub->sc); + xchk_scrub_free_subord(sub); return error; } diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 89f7bbec887e..3d5f1f6b4b7b 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -6,31 +6,6 @@ #ifndef __XFS_SCRUB_COMMON_H__ #define __XFS_SCRUB_COMMON_H__ -/* - * We /could/ terminate a scrub/repair operation early. If we're not - * in a good place to continue (fatal signal, etc.) then bail out. - * Note that we're careful not to make any judgements about *error. 
- */ -static inline bool -xchk_should_terminate( - struct xfs_scrub *sc, - int *error) -{ - /* - * If preemption is disabled, we need to yield to the scheduler every - * few seconds so that we don't run afoul of the soft lockup watchdog - * or RCU stall detector. - */ - cond_resched(); - - if (fatal_signal_pending(current)) { - if (*error == 0) - *error = -EINTR; - return true; - } - return false; -} - int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks); int xchk_trans_alloc_empty(struct xfs_scrub *sc); void xchk_trans_cancel(struct xfs_scrub *sc); @@ -92,6 +67,7 @@ int xchk_setup_directory(struct xfs_scrub *sc); int xchk_setup_xattr(struct xfs_scrub *sc); int xchk_setup_symlink(struct xfs_scrub *sc); int xchk_setup_parent(struct xfs_scrub *sc); +int xchk_setup_dirtree(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xchk_setup_rtbitmap(struct xfs_scrub *sc); int xchk_setup_rtsummary(struct xfs_scrub *sc); @@ -212,6 +188,7 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) } bool xchk_dir_looks_zapped(struct xfs_inode *dp); +bool xchk_pptr_looks_zapped(struct xfs_inode *ip); #ifdef CONFIG_XFS_ONLINE_REPAIR /* Decide if a repair is required. */ diff --git a/fs/xfs/scrub/dab_bitmap.h b/fs/xfs/scrub/dab_bitmap.h new file mode 100644 index 000000000000..0c6e3aad4395 --- /dev/null +++ b/fs/xfs/scrub/dab_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_DAB_BITMAP_H__ +#define __XFS_SCRUB_DAB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_dablk_t */ + +struct xdab_bitmap { + struct xbitmap32 dabitmap; +}; + +static inline void xdab_bitmap_init(struct xdab_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->dabitmap); +} + +static inline void xdab_bitmap_destroy(struct xdab_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->dabitmap); +} + +static inline int xdab_bitmap_set(struct xdab_bitmap *bitmap, + xfs_dablk_t dabno, xfs_extlen_t len) +{ + return xbitmap32_set(&bitmap->dabitmap, dabno, len); +} + +static inline bool xdab_bitmap_test(struct xdab_bitmap *bitmap, + xfs_dablk_t dabno, xfs_extlen_t *len) +{ + return xbitmap32_test(&bitmap->dabitmap, dabno, len); +} + +#endif /* __XFS_SCRUB_DAB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 82b150d3b8b7..056de4819f86 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -78,6 +78,22 @@ xchk_da_set_corrupt( __return_address); } +/* Flag a da btree node in need of optimization. */ +void +xchk_da_set_preen( + struct xchk_da_btree *ds, + int level) +{ + struct xfs_scrub *sc = ds->sc; + + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN; + trace_xchk_fblock_preen(sc, ds->dargs.whichfork, + xfs_dir2_da_to_db(ds->dargs.geo, + ds->state->path.blk[level].blkno), + __return_address); +} + +/* Find an entry at a certain level in a da btree. 
*/ static struct xfs_da_node_entry * xchk_da_btree_node_entry( struct xchk_da_btree *ds, @@ -320,6 +336,7 @@ xchk_da_btree_block( struct xfs_da3_blkinfo *hdr3; struct xfs_da_args *dargs = &ds->dargs; struct xfs_inode *ip = ds->dargs.dp; + xfs_failaddr_t fa; xfs_ino_t owner; int *pmaxrecs; struct xfs_da3_icnode_hdr nodehdr; @@ -442,6 +459,12 @@ xchk_da_btree_block( goto out_freebp; } + fa = xfs_da3_header_check(blk->bp, dargs->owner); + if (fa) { + xchk_da_set_corrupt(ds, level); + goto out_freebp; + } + /* * If we've been handed a block that is below the dabtree root, does * its hashval match what the parent block expected to see? @@ -494,6 +517,7 @@ xchk_da_btree( ds->dargs.whichfork = whichfork; ds->dargs.trans = sc->tp; ds->dargs.op_flags = XFS_DA_OP_OKNOENT; + ds->dargs.owner = sc->ip->i_ino; ds->state = xfs_da_state_alloc(&ds->dargs); ds->sc = sc; ds->private = private; diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h index 4f8c2138a1ec..de291e3b77dd 100644 --- a/fs/xfs/scrub/dabtree.h +++ b/fs/xfs/scrub/dabtree.h @@ -35,6 +35,9 @@ bool xchk_da_process_error(struct xchk_da_btree *ds, int level, int *error); /* Check for da btree corruption. 
*/ void xchk_da_set_corrupt(struct xchk_da_btree *ds, int level); +void xchk_da_set_preen(struct xchk_da_btree *ds, int level); + +void xchk_da_set_preen(struct xchk_da_btree *ds, int level); int xchk_da_btree_hash(struct xchk_da_btree *ds, int level, __be32 *hashp); int xchk_da_btree(struct xfs_scrub *sc, int whichfork, diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 076a310b8eb0..bf9199e8df63 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -16,22 +16,70 @@ #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_health.h" +#include "xfs_attr.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" #include "scrub/readdir.h" #include "scrub/health.h" +#include "scrub/repair.h" +#include "scrub/trace.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" /* Set us up to scrub directories. */ int xchk_setup_directory( struct xfs_scrub *sc) { + int error; + + if (xchk_could_repair(sc)) { + error = xrep_setup_directory(sc); + if (error) + return error; + } + return xchk_setup_inode_contents(sc, 0); } /* Directories */ +/* Deferred directory entry that we saved for later. */ +struct xchk_dirent { + /* Cookie for retrieval of the dirent name. */ + xfblob_cookie name_cookie; + + /* Child inode number. */ + xfs_ino_t ino; + + /* Length of the pptr name. */ + uint8_t namelen; +}; + +struct xchk_dir { + struct xfs_scrub *sc; + + /* information for parent pointer validation. */ + struct xfs_parent_rec pptr_rec; + struct xfs_da_args pptr_args; + + /* Fixed-size array of xchk_dirent structures. */ + struct xfarray *dir_entries; + + /* Blobs containing dirent names. */ + struct xfblob *dir_names; + + /* If we've cycled the ILOCK, we must revalidate deferred dirents. */ + bool need_revalidate; + + /* Name buffer for dirent revalidation. */ + struct xfs_name xname; + uint8_t namebuf[MAXNAMELEN]; +}; + /* Scrub a directory entry. 
*/ /* Check that an inode's mode matches a given XFS_DIR3_FT_* type. */ @@ -55,6 +103,108 @@ xchk_dir_check_ftype( } /* + * Try to lock a child file for checking parent pointers. Returns the inode + * flags for the locks we now hold, or zero if we failed. + */ +STATIC unsigned int +xchk_dir_lock_child( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) + return 0; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; + } + + if (!xfs_inode_has_attr_fork(ip) || !xfs_need_iread_extents(&ip->i_af)) + return XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED; + + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; + } + + return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL; +} + +/* Check the backwards link (parent pointer) associated with this dirent. */ +STATIC int +xchk_dir_parent_pointer( + struct xchk_dir *sd, + const struct xfs_name *name, + struct xfs_inode *ip) +{ + struct xfs_scrub *sc = sd->sc; + int error; + + xfs_inode_to_parent_rec(&sd->pptr_rec, sc->ip); + error = xfs_parent_lookup(sc->tp, ip, name, &sd->pptr_rec, + &sd->pptr_args); + if (error == -ENOATTR) + xchk_fblock_xref_set_corrupt(sc, XFS_DATA_FORK, 0); + + return 0; +} + +/* Look for a parent pointer matching this dirent, if the child isn't busy. */ +STATIC int +xchk_dir_check_pptr_fast( + struct xchk_dir *sd, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + struct xfs_inode *ip) +{ + struct xfs_scrub *sc = sd->sc; + unsigned int lockmode; + int error; + + /* dot and dotdot entries do not have parent pointers */ + if (xfs_dir2_samename(name, &xfs_name_dot) || + xfs_dir2_samename(name, &xfs_name_dotdot)) + return 0; + + /* No self-referential non-dot or dotdot dirents. */ + if (ip == sc->ip) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return -ECANCELED; + } + + /* Try to lock the inode. 
*/ + lockmode = xchk_dir_lock_child(sc, ip); + if (!lockmode) { + struct xchk_dirent save_de = { + .namelen = name->len, + .ino = ip->i_ino, + }; + + /* Couldn't lock the inode, so save the dirent for later. */ + trace_xchk_dir_defer(sc->ip, name, ip->i_ino); + + error = xfblob_storename(sd->dir_names, &save_de.name_cookie, + name); + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, + &error)) + return error; + + error = xfarray_append(sd->dir_entries, &save_de); + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, + &error)) + return error; + + return 0; + } + + error = xchk_dir_parent_pointer(sd, name, ip); + xfs_iunlock(ip, lockmode); + return error; +} + +/* * Scrub a single directory entry. * * Check the inode number to make sure it's sane, then we check that we can @@ -71,6 +221,7 @@ xchk_dir_actor( { struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip; + struct xchk_dir *sd = priv; xfs_ino_t lookup_ino; xfs_dablk_t offset; int error = 0; @@ -137,6 +288,14 @@ xchk_dir_actor( goto out; xchk_dir_check_ftype(sc, offset, ip, name->type); + + if (xfs_has_parent(mp)) { + error = xchk_dir_check_pptr_fast(sd, dapos, name, ip); + if (error) + goto out_rele; + } + +out_rele: xchk_irele(sc, ip); out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) @@ -196,8 +355,8 @@ xchk_dir_rec( xchk_da_set_corrupt(ds, level); goto out; } - error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno, - XFS_DABUF_MAP_HOLE_OK, &bp); + error = xfs_dir3_data_read(ds->dargs.trans, dp, ds->dargs.owner, + rec_bno, XFS_DABUF_MAP_HOLE_OK, &bp); if (!xchk_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno, &error)) goto out; @@ -315,10 +474,11 @@ xchk_directory_data_bestfree( /* dir block format */ if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); - error = xfs_dir3_block_read(sc->tp, sc->ip, &bp); + error = xfs_dir3_block_read(sc->tp, sc->ip, sc->ip->i_ino, &bp); } else { /* dir data format */ - error = 
xfs_dir3_data_read(sc->tp, sc->ip, lblk, 0, &bp); + error = xfs_dir3_data_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, + 0, &bp); } if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; @@ -470,7 +630,7 @@ xchk_directory_leaf1_bestfree( int error; /* Read the free space block. */ - error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp); + error = xfs_dir3_leaf_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, &bp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) return error; xchk_buffer_recheck(sc, bp); @@ -531,10 +691,9 @@ xchk_directory_leaf1_bestfree( /* Check all the bestfree entries. */ for (i = 0; i < bestcount; i++, bestp++) { best = be16_to_cpu(*bestp); - error = xfs_dir3_data_read(sc->tp, sc->ip, + error = xfs_dir3_data_read(sc->tp, sc->ip, args->owner, xfs_dir2_db_to_da(args->geo, i), - XFS_DABUF_MAP_HOLE_OK, - &dbp); + XFS_DABUF_MAP_HOLE_OK, &dbp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) break; @@ -577,7 +736,7 @@ xchk_directory_free_bestfree( int error; /* Read the free space block */ - error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp); + error = xfs_dir2_free_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, &bp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) return error; xchk_buffer_recheck(sc, bp); @@ -597,7 +756,7 @@ xchk_directory_free_bestfree( stale++; continue; } - error = xfs_dir3_data_read(sc->tp, sc->ip, + error = xfs_dir3_data_read(sc->tp, sc->ip, args->owner, (freehdr.firstdb + i) * args->geo->fsbcount, 0, &dbp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, @@ -621,10 +780,11 @@ xchk_directory_blocks( { struct xfs_bmbt_irec got; struct xfs_da_args args = { - .dp = sc ->ip, + .dp = sc->ip, .whichfork = XFS_DATA_FORK, .geo = sc->mp->m_dir_geo, .trans = sc->tp, + .owner = sc->ip->i_ino, }; struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); struct xfs_mount *mp = sc->mp; @@ -648,7 +808,8 @@ xchk_directory_blocks( free_lblk = XFS_B_TO_FSB(mp, 
XFS_DIR2_FREE_OFFSET); /* Is this a block dir? */ - error = xfs_dir2_isblock(&args, &is_block); + if (xfs_dir2_format(&args, &error) == XFS_DIR2_FMT_BLOCK) + is_block = true; if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; @@ -752,11 +913,148 @@ out: return error; } +/* + * Revalidate a dirent that we collected in the past but couldn't check because + * of lock contention. Returns 0 if the dirent is still valid, -ENOENT if it + * has gone away on us, or a negative errno. + */ +STATIC int +xchk_dir_revalidate_dirent( + struct xchk_dir *sd, + const struct xfs_name *xname, + xfs_ino_t ino) +{ + struct xfs_scrub *sc = sd->sc; + xfs_ino_t child_ino; + int error; + + /* + * Look up the directory entry. If we get -ENOENT, the directory entry + * went away and there's nothing to revalidate. Return any other + * error. + */ + error = xchk_dir_lookup(sc, sc->ip, xname, &child_ino); + if (error) + return error; + + /* The inode number changed, nothing to revalidate. */ + if (ino != child_ino) + return -ENOENT; + + return 0; +} + +/* + * Check a directory entry's parent pointers the slow way, which means we cycle + * locks a bunch and put up with revalidation until we get it done. + */ +STATIC int +xchk_dir_slow_dirent( + struct xchk_dir *sd, + struct xchk_dirent *dirent, + const struct xfs_name *xname) +{ + struct xfs_scrub *sc = sd->sc; + struct xfs_inode *ip; + unsigned int lockmode; + int error; + + /* Check that the deferred dirent still exists. 
*/ + if (sd->need_revalidate) { + error = xchk_dir_revalidate_dirent(sd, xname, dirent->ino); + if (error == -ENOENT) + return 0; + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, + &error)) + return error; + } + + error = xchk_iget(sc, dirent->ino, &ip); + if (error == -EINVAL || error == -ENOENT) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) + return error; + + /* + * If we can grab both IOLOCK and ILOCK of the alleged child, we can + * proceed with the validation. + */ + lockmode = xchk_dir_lock_child(sc, ip); + if (lockmode) { + trace_xchk_dir_slowpath(sc->ip, xname, ip->i_ino); + goto check_pptr; + } + + /* + * We couldn't lock the child file. Drop all the locks and try to + * get them again, one at a time. + */ + xchk_iunlock(sc, sc->ilock_flags); + sd->need_revalidate = true; + + trace_xchk_dir_ultraslowpath(sc->ip, xname, ip->i_ino); + + error = xchk_dir_trylock_for_pptrs(sc, ip, &lockmode); + if (error) + goto out_rele; + + /* Revalidate, since we just cycled the locks. */ + error = xchk_dir_revalidate_dirent(sd, xname, dirent->ino); + if (error == -ENOENT) { + error = 0; + goto out_unlock; + } + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out_unlock; + +check_pptr: + error = xchk_dir_parent_pointer(sd, xname, ip); +out_unlock: + xfs_iunlock(ip, lockmode); +out_rele: + xchk_irele(sc, ip); + return error; +} + +/* Check all the dirents that we deferred the first time around. 
*/ +STATIC int +xchk_dir_finish_slow_dirents( + struct xchk_dir *sd) +{ + xfarray_idx_t array_cur; + int error; + + foreach_xfarray_idx(sd->dir_entries, array_cur) { + struct xchk_dirent dirent; + + if (sd->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + error = xfarray_load(sd->dir_entries, array_cur, &dirent); + if (error) + return error; + + error = xfblob_loadname(sd->dir_names, dirent.name_cookie, + &sd->xname, dirent.namelen); + if (error) + return error; + + error = xchk_dir_slow_dirent(sd, &dirent, &sd->xname); + if (error) + return error; + } + + return 0; +} + /* Scrub a whole directory. */ int xchk_directory( struct xfs_scrub *sc) { + struct xchk_dir *sd; int error; if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) @@ -789,9 +1087,60 @@ xchk_directory( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return 0; + sd = kvzalloc(sizeof(struct xchk_dir), XCHK_GFP_FLAGS); + if (!sd) + return -ENOMEM; + sd->sc = sc; + sd->xname.name = sd->namebuf; + + if (xfs_has_parent(sc->mp)) { + char *descr; + + /* + * Set up some staging memory for dirents that we can't check + * due to locking contention. + */ + descr = xchk_xfile_ino_descr(sc, "slow directory entries"); + error = xfarray_create(descr, 0, sizeof(struct xchk_dirent), + &sd->dir_entries); + kfree(descr); + if (error) + goto out_sd; + + descr = xchk_xfile_ino_descr(sc, "slow directory entry names"); + error = xfblob_create(descr, &sd->dir_names); + kfree(descr); + if (error) + goto out_entries; + } + /* Look up every name in this directory by hash. 
*/ - error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL); - if (error && error != -ECANCELED) + error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, sd); + if (error == -ECANCELED) + error = 0; + if (error) + goto out_names; + + if (xfs_has_parent(sc->mp)) { + error = xchk_dir_finish_slow_dirents(sd); + if (error == -ETIMEDOUT) { + /* Couldn't grab a lock, scrub was marked incomplete */ + error = 0; + goto out_names; + } + if (error) + goto out_names; + } + +out_names: + if (sd->dir_names) + xfblob_destroy(sd->dir_names); +out_entries: + if (sd->dir_entries) + xfarray_destroy(sd->dir_entries); +out_sd: + kvfree(sd); + if (error) return error; /* If the dir is clean, it is clearly not zapped. */ diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c new file mode 100644 index 000000000000..64679fe08446 --- /dev/null +++ b/fs/xfs/scrub/dir_repair.c @@ -0,0 +1,1958 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_bmap_util.h" +#include "xfs_exchmaps.h" +#include "xfs_exchrange.h" +#include "xfs_ag.h" +#include "xfs_parent.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/iscan.h" +#include "scrub/readdir.h" +#include "scrub/reap.h" +#include "scrub/findparent.h" +#include "scrub/orphanage.h" +#include "scrub/listxattr.h" + +/* + * Directory Repair + * ================ + * + * We repair directories by reading the directory data blocks looking for + * directory entries that look salvageable (name passes verifiers, entry points + * to a valid allocated inode, etc). Each entry worth salvaging is stashed in + * memory, and the stashed entries are periodically replayed into a temporary + * directory to constrain memory use. Batching the construction of the + * temporary directory in this fashion reduces lock cycling of the directory + * being repaired and the temporary directory, and will later become important + * for parent pointer scanning. + * + * If parent pointers are enabled on this filesystem, we instead reconstruct + * the directory by visiting each parent pointer of each file in the filesystem + * and translating the relevant parent pointer records into dirents. 
In this + * case, it is advantageous to stash all directory entries created from parent + * pointers for a single child file before replaying them into the temporary + * directory. To save memory, the live filesystem scan reuses the findparent + * fields. Directory repair chooses either parent pointer scanning or + * directory entry salvaging, but not both. + * + * Directory entries added to the temporary directory do not elevate the link + * counts of the inodes found. When salvaging completes, the remaining stashed + * entries are replayed to the temporary directory. An atomic mapping exchange + * is used to commit the new directory blocks to the directory being repaired. + * This will disrupt readdir cursors. + * + * Locking Issues + * -------------- + * + * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on + * /a/b for a "mv /a/b /c/" operation. This means that only b's ILOCK protects + * b's dotdot update. This is in contrast to every other dotdot update (link, + * remove, mkdir). If the repair code drops the ILOCK, it must either + * revalidate the dotdot entry or use dirent hooks to capture updates from + * other threads. + */ + +/* Create a dirent in the tempdir. */ +#define XREP_DIRENT_ADD (1) + +/* Remove a dirent from the tempdir. */ +#define XREP_DIRENT_REMOVE (2) + +/* Directory entry to be restored in the new directory. */ +struct xrep_dirent { + /* Cookie for retrieval of the dirent name. */ + xfblob_cookie name_cookie; + + /* Target inode number. */ + xfs_ino_t ino; + + /* Length of the dirent name. */ + uint8_t namelen; + + /* File type of the dirent. */ + uint8_t ftype; + + /* XREP_DIRENT_{ADD,REMOVE} */ + uint8_t action; +}; + +/* + * Stash up to 8 pages of recovered dirent data in dir_entries and dir_names + * before we write them to the temp dir. + */ +#define XREP_DIR_MAX_STASH_BYTES (PAGE_SIZE * 8) + +struct xrep_dir { + struct xfs_scrub *sc; + + /* Fixed-size array of xrep_dirent structures. 
*/ + struct xfarray *dir_entries; + + /* Blobs containing directory entry names. */ + struct xfblob *dir_names; + + /* Information for exchanging data forks at the end. */ + struct xrep_tempexch tx; + + /* Preallocated args struct for performing dir operations */ + struct xfs_da_args args; + + /* + * Information used to scan the filesystem to find the inumber of the + * dotdot entry for this directory. For directory salvaging when + * parent pointers are not enabled, we use the findparent_* functions + * on this object and access only the parent_ino field directly. + * + * When parent pointers are enabled, however, the pptr scanner uses the + * iscan, hooks, lock, and parent_ino fields of this object directly. + * @pscan.lock coordinates access to dir_entries, dir_names, + * parent_ino, subdirs, dirents, and args. This reduces the memory + * requirements of this structure. + */ + struct xrep_parent_scan_info pscan; + + /* + * Context information for attaching this directory to the lost+found + * if this directory does not have a parent. + */ + struct xrep_adoption adoption; + + /* How many subdirectories did we find? */ + uint64_t subdirs; + + /* How many dirents did we find? */ + unsigned int dirents; + + /* Should we move this directory to the orphanage? */ + bool needs_adoption; + + /* Directory entry name, plus the trailing null. */ + struct xfs_name xname; + unsigned char namebuf[MAXNAMELEN]; +}; + +/* Tear down all the incore stuff we created. */ +static void +xrep_dir_teardown( + struct xfs_scrub *sc) +{ + struct xrep_dir *rd = sc->buf; + + xrep_findparent_scan_teardown(&rd->pscan); + xfblob_destroy(rd->dir_names); + xfarray_destroy(rd->dir_entries); +} + +/* Set up for a directory repair. 
*/ +int +xrep_setup_directory( + struct xfs_scrub *sc) +{ + struct xrep_dir *rd; + int error; + + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); + + error = xrep_orphanage_try_create(sc); + if (error) + return error; + + error = xrep_tempfile_create(sc, S_IFDIR); + if (error) + return error; + + rd = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS); + if (!rd) + return -ENOMEM; + rd->sc = sc; + rd->xname.name = rd->namebuf; + sc->buf = rd; + + return 0; +} + +/* + * Look up the dotdot entry and confirm that it's really the parent. + * Returns NULLFSINO if we don't know what to do. + */ +static inline xfs_ino_t +xrep_dir_lookup_parent( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + xfs_ino_t ino; + int error; + + error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL); + if (error) + return NULLFSINO; + if (!xfs_verify_dir_ino(sc->mp, ino)) + return NULLFSINO; + + error = xrep_findparent_confirm(sc, &ino); + if (error) + return NULLFSINO; + + return ino; +} + +/* + * Look up '..' in the dentry cache and confirm that it's really the parent. + * Returns NULLFSINO if the dcache misses or if the hit is implausible. + */ +static inline xfs_ino_t +xrep_dir_dcache_parent( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + xfs_ino_t parent_ino; + int error; + + parent_ino = xrep_findparent_from_dcache(sc); + if (parent_ino == NULLFSINO) + return parent_ino; + + error = xrep_findparent_confirm(sc, &parent_ino); + if (error) + return NULLFSINO; + + return parent_ino; +} + +/* Try to find the parent of the directory being repaired. 
*/ +STATIC int +xrep_dir_find_parent( + struct xrep_dir *rd) +{ + xfs_ino_t ino; + + ino = xrep_findparent_self_reference(rd->sc); + if (ino != NULLFSINO) { + xrep_findparent_scan_finish_early(&rd->pscan, ino); + return 0; + } + + ino = xrep_dir_dcache_parent(rd); + if (ino != NULLFSINO) { + xrep_findparent_scan_finish_early(&rd->pscan, ino); + return 0; + } + + ino = xrep_dir_lookup_parent(rd); + if (ino != NULLFSINO) { + xrep_findparent_scan_finish_early(&rd->pscan, ino); + return 0; + } + + /* + * A full filesystem scan is the last resort. On a busy filesystem, + * the scan can fail with -EBUSY if we cannot grab IOLOCKs. That means + * that we don't know who the parent is, so we should return to + * userspace. + */ + return xrep_findparent_scan(&rd->pscan); +} + +/* + * Decide if we want to salvage this entry. We don't bother with oversized + * names or the dot entry. + */ +STATIC int +xrep_dir_want_salvage( + struct xrep_dir *rd, + const char *name, + int namelen, + xfs_ino_t ino) +{ + struct xfs_mount *mp = rd->sc->mp; + + /* No pointers to ourselves or to garbage. */ + if (ino == rd->sc->ip->i_ino) + return false; + if (!xfs_verify_dir_ino(mp, ino)) + return false; + + /* No weird looking names or dot entries. */ + if (namelen >= MAXNAMELEN || namelen <= 0) + return false; + if (namelen == 1 && name[0] == '.') + return false; + if (!xfs_dir2_namecheck(name, namelen)) + return false; + + return true; +} + +/* + * Remember that we want to create a dirent in the tempdir. These stashed + * actions will be replayed later.
+ */ +STATIC int +xrep_dir_stash_createname( + struct xrep_dir *rd, + const struct xfs_name *name, + xfs_ino_t ino) +{ + struct xrep_dirent dirent = { + .action = XREP_DIRENT_ADD, + .ino = ino, + .namelen = name->len, + .ftype = name->type, + }; + int error; + + trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino); + + error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name); + if (error) + return error; + + return xfarray_append(rd->dir_entries, &dirent); +} + +/* + * Remember that we want to remove a dirent from the tempdir. These stashed + * actions will be replayed later. + */ +STATIC int +xrep_dir_stash_removename( + struct xrep_dir *rd, + const struct xfs_name *name, + xfs_ino_t ino) +{ + struct xrep_dirent dirent = { + .action = XREP_DIRENT_REMOVE, + .ino = ino, + .namelen = name->len, + .ftype = name->type, + }; + int error; + + trace_xrep_dir_stash_removename(rd->sc->tempip, name, ino); + + error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name); + if (error) + return error; + + return xfarray_append(rd->dir_entries, &dirent); +} + +/* Allocate an in-core record to hold entries while we rebuild the dir data. */ +STATIC int +xrep_dir_salvage_entry( + struct xrep_dir *rd, + unsigned char *name, + unsigned int namelen, + xfs_ino_t ino) +{ + struct xfs_name xname = { + .name = name, + }; + struct xfs_scrub *sc = rd->sc; + struct xfs_inode *ip; + unsigned int i = 0; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Truncate the name to the first character that would trip namecheck. + * If we no longer have a name after that, ignore this entry. + */ + while (i < namelen && name[i] != 0 && name[i] != '/') + i++; + if (i == 0) + return 0; + xname.len = i; + + /* Ignore '..' entries; we already picked the new parent. */ + if (xname.len == 2 && name[0] == '.' 
&& name[1] == '.') { + trace_xrep_dir_salvaged_parent(sc->ip, ino); + return 0; + } + + trace_xrep_dir_salvage_entry(sc->ip, &xname, ino); + + /* + * Compute the ftype or dump the entry if we can't. We don't lock the + * inode because inodes can't change type while we have a reference. + */ + error = xchk_iget(sc, ino, &ip); + if (error) + return 0; + + xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode); + xchk_irele(sc, ip); + + return xrep_dir_stash_createname(rd, &xname, ino); +} + +/* Record a shortform directory entry for later reinsertion. */ +STATIC int +xrep_dir_salvage_sf_entry( + struct xrep_dir *rd, + struct xfs_dir2_sf_hdr *sfp, + struct xfs_dir2_sf_entry *sfep) +{ + xfs_ino_t ino; + + ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep); + if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino)) + return 0; + + return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino); +} + +/* Record a regular directory entry for later reinsertion. */ +STATIC int +xrep_dir_salvage_data_entry( + struct xrep_dir *rd, + struct xfs_dir2_data_entry *dep) +{ + xfs_ino_t ino; + + ino = be64_to_cpu(dep->inumber); + if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino)) + return 0; + + return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino); +} + +/* Try to recover block/data format directory entries. */ +STATIC int +xrep_dir_recover_data( + struct xrep_dir *rd, + struct xfs_buf *bp) +{ + struct xfs_da_geometry *geo = rd->sc->mp->m_dir_geo; + unsigned int offset; + unsigned int end; + int error = 0; + + /* + * Loop over the data portion of the block. + * Each object is a real entry (dep) or an unused one (dup). 
+ */ + offset = geo->data_entry_offset; + end = min_t(unsigned int, BBTOB(bp->b_length), + xfs_dir3_data_end_offset(geo, bp->b_addr)); + + while (offset < end) { + struct xfs_dir2_data_unused *dup = bp->b_addr + offset; + struct xfs_dir2_data_entry *dep = bp->b_addr + offset; + + if (xchk_should_terminate(rd->sc, &error)) + return error; + + /* Skip unused entries. */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + offset += be16_to_cpu(dup->length); + continue; + } + + /* Don't walk off the end of the block. */ + offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen); + if (offset > end) + break; + + /* Ok, let's save this entry. */ + error = xrep_dir_salvage_data_entry(rd, dep); + if (error) + return error; + + } + + return 0; +} + +/* Try to recover shortform directory entries. */ +STATIC int +xrep_dir_recover_sf( + struct xrep_dir *rd) +{ + struct xfs_dir2_sf_hdr *hdr; + struct xfs_dir2_sf_entry *sfep; + struct xfs_dir2_sf_entry *next; + struct xfs_ifork *ifp; + xfs_ino_t ino; + unsigned char *end; + int error = 0; + + ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK); + hdr = ifp->if_data; + end = (unsigned char *)ifp->if_data + ifp->if_bytes; + + ino = xfs_dir2_sf_get_parent_ino(hdr); + trace_xrep_dir_salvaged_parent(rd->sc->ip, ino); + + sfep = xfs_dir2_sf_firstentry(hdr); + while ((unsigned char *)sfep < end) { + if (xchk_should_terminate(rd->sc, &error)) + return error; + + next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep); + if ((unsigned char *)next > end) + break; + + /* Ok, let's save this entry. */ + error = xrep_dir_salvage_sf_entry(rd, hdr, sfep); + if (error) + return error; + + sfep = next; + } + + return 0; +} + +/* + * Try to figure out the format of this directory from the data fork mappings + * and the directory size. If we can be reasonably sure of format, we can be + * more aggressive in salvaging directory entries. 
On return, @magic_guess + * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format" + * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory, + * and 0 if we can't tell. + */ +STATIC void +xrep_dir_guess_format( + struct xrep_dir *rd, + __be32 *magic_guess) +{ + struct xfs_inode *dp = rd->sc->ip; + struct xfs_mount *mp = rd->sc->mp; + struct xfs_da_geometry *geo = mp->m_dir_geo; + xfs_fileoff_t last; + int error; + + ASSERT(xfs_has_crc(mp)); + + *magic_guess = 0; + + /* + * If there's a single directory block and the directory size is + * exactly one block, this has to be a single block format directory. + */ + error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK); + if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize && + dp->i_disk_size == geo->blksize) { + *magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); + return; + } + + /* + * If the last extent before the leaf offset matches the directory + * size and the directory size is larger than 1 block, this is a + * data format directory. + */ + last = geo->leafblk; + error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK); + if (!error && + XFS_FSB_TO_B(mp, last) > geo->blksize && + XFS_FSB_TO_B(mp, last) == dp->i_disk_size) { + *magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC); + return; + } +} + +/* Recover directory entries from a specific directory block. */ +STATIC int +xrep_dir_recover_dirblock( + struct xrep_dir *rd, + __be32 magic_guess, + xfs_dablk_t dabno) +{ + struct xfs_dir2_data_hdr *hdr; + struct xfs_buf *bp; + __be32 oldmagic; + int error; + + /* + * Try to read buffer. We invalidate them in the next step so we don't + * bother to set a buffer type or ops. 
+ */ + error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno, + XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL); + if (error || !bp) + return error; + + hdr = bp->b_addr; + oldmagic = hdr->magic; + + trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno, + be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess)); + + /* + * If we're sure of the block's format, proceed with the salvage + * operation using the specified magic number. + */ + if (magic_guess) { + hdr->magic = magic_guess; + goto recover; + } + + /* + * If we couldn't guess what type of directory this is, then we will + * only salvage entries from directory blocks that match the magic + * number and pass verifiers. + */ + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): + if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops)) + goto out; + if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL) + goto out; + break; + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): + if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops)) + goto out; + if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL) + goto out; + break; + default: + goto out; + } + +recover: + error = xrep_dir_recover_data(rd, bp); + +out: + hdr->magic = oldmagic; + xfs_trans_brelse(rd->sc->tp, bp); + return error; +} + +static inline void +xrep_dir_init_args( + struct xrep_dir *rd, + struct xfs_inode *dp, + const struct xfs_name *name) +{ + memset(&rd->args, 0, sizeof(struct xfs_da_args)); + rd->args.geo = rd->sc->mp->m_dir_geo; + rd->args.whichfork = XFS_DATA_FORK; + rd->args.owner = rd->sc->ip->i_ino; + rd->args.trans = rd->sc->tp; + rd->args.dp = dp; + if (!name) + return; + rd->args.name = name->name; + rd->args.namelen = name->len; + rd->args.filetype = name->type; + rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name); +} + +/* Replay a stashed createname into the temporary directory. 
*/ +STATIC int +xrep_dir_replay_createname( + struct xrep_dir *rd, + const struct xfs_name *name, + xfs_ino_t inum, + xfs_extlen_t total) +{ + struct xfs_scrub *sc = rd->sc; + struct xfs_inode *dp = rd->sc->tempip; + int error; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + + error = xfs_dir_ino_validate(sc->mp, inum); + if (error) + return error; + + trace_xrep_dir_replay_createname(dp, name, inum); + + xrep_dir_init_args(rd, dp, name); + rd->args.inumber = inum; + rd->args.total = total; + rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + return xfs_dir_createname_args(&rd->args); +} + +/* Replay a stashed removename onto the temporary directory. */ +STATIC int +xrep_dir_replay_removename( + struct xrep_dir *rd, + const struct xfs_name *name, + xfs_extlen_t total) +{ + struct xfs_inode *dp = rd->args.dp; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + + xrep_dir_init_args(rd, dp, name); + rd->args.op_flags = 0; + rd->args.total = total; + + trace_xrep_dir_replay_removename(dp, name, 0); + return xfs_dir_removename_args(&rd->args); +} + +/* + * Add this stashed incore directory entry to the temporary directory. + * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and + * must not be in transaction context. + */ +STATIC int +xrep_dir_replay_update( + struct xrep_dir *rd, + const struct xfs_name *xname, + const struct xrep_dirent *dirent) +{ + struct xfs_mount *mp = rd->sc->mp; +#ifdef DEBUG + xfs_ino_t ino; +#endif + uint resblks; + int error; + + resblks = xfs_link_space_res(mp, xname->len); + error = xchk_trans_alloc(rd->sc, resblks); + if (error) + return error; + + /* Lock the temporary directory and join it to the transaction */ + xrep_tempfile_ilock(rd->sc); + xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0); + + switch (dirent->action) { + case XREP_DIRENT_ADD: + /* + * Create a replacement dirent in the temporary directory. + * Note that _createname doesn't check for existing entries. 
+ * There shouldn't be any in the temporary dir, but we'll + * verify this in debug mode. + */ +#ifdef DEBUG + error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino); + if (error != -ENOENT) { + ASSERT(error != -ENOENT); + goto out_cancel; + } +#endif + + error = xrep_dir_replay_createname(rd, xname, dirent->ino, + resblks); + if (error) + goto out_cancel; + + if (xname->type == XFS_DIR3_FT_DIR) + rd->subdirs++; + rd->dirents++; + break; + case XREP_DIRENT_REMOVE: + /* + * Remove a dirent from the temporary directory. Note that + * _removename doesn't check the inode target of the exist + * entry. There should be a perfect match in the temporary + * dir, but we'll verify this in debug mode. + */ +#ifdef DEBUG + error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino); + if (error) { + ASSERT(error != 0); + goto out_cancel; + } + if (ino != dirent->ino) { + ASSERT(ino == dirent->ino); + error = -EIO; + goto out_cancel; + } +#endif + + error = xrep_dir_replay_removename(rd, xname, resblks); + if (error) + goto out_cancel; + + if (xname->type == XFS_DIR3_FT_DIR) + rd->subdirs--; + rd->dirents--; + break; + default: + ASSERT(0); + error = -EIO; + goto out_cancel; + } + + /* Commit and unlock. */ + error = xrep_trans_commit(rd->sc); + if (error) + return error; + + xrep_tempfile_iunlock(rd->sc); + return 0; +out_cancel: + xchk_trans_cancel(rd->sc); + xrep_tempfile_iunlock(rd->sc); + return error; +} + +/* + * Flush stashed incore dirent updates that have been recorded by the scanner. + * This is done to reduce the memory requirements of the directory rebuild, + * since directories can contain up to 32GB of directory data. + * + * Caller must not hold transactions or ILOCKs. Caller must hold the tempdir + * IOLOCK. + */ +STATIC int +xrep_dir_replay_updates( + struct xrep_dir *rd) +{ + xfarray_idx_t array_cur; + int error; + + /* Add all the salvaged dirents to the temporary directory. 
*/ + mutex_lock(&rd->pscan.lock); + foreach_xfarray_idx(rd->dir_entries, array_cur) { + struct xrep_dirent dirent; + + error = xfarray_load(rd->dir_entries, array_cur, &dirent); + if (error) + goto out_unlock; + + error = xfblob_loadname(rd->dir_names, dirent.name_cookie, + &rd->xname, dirent.namelen); + if (error) + goto out_unlock; + rd->xname.type = dirent.ftype; + mutex_unlock(&rd->pscan.lock); + + error = xrep_dir_replay_update(rd, &rd->xname, &dirent); + if (error) + return error; + mutex_lock(&rd->pscan.lock); + } + + /* Empty out both arrays now that we've added the entries. */ + xfarray_truncate(rd->dir_entries); + xfblob_truncate(rd->dir_names); + mutex_unlock(&rd->pscan.lock); + return 0; +out_unlock: + mutex_unlock(&rd->pscan.lock); + return error; +} + +/* + * Periodically flush stashed directory entries to the temporary dir. This + * is done to reduce the memory requirements of the directory rebuild, since + * directories can contain up to 32GB of directory data. + */ +STATIC int +xrep_dir_flush_stashed( + struct xrep_dir *rd) +{ + int error; + + /* + * Entering this function, the scrub context has a reference to the + * inode being repaired, the temporary file, and a scrub transaction + * that we use during dirent salvaging to avoid livelocking if there + * are cycles in the directory structures. We hold ILOCK_EXCL on both + * the inode being repaired and the temporary file, though they are + * not ijoined to the scrub transaction. + * + * To constrain kernel memory use, we occasionally write salvaged + * dirents from the xfarray and xfblob structures into the temporary + * directory in preparation for exchanging the directory structures at + * the end. Updating the temporary file requires a transaction, so we + * commit the scrub transaction and drop the two ILOCKs so that + * we can allocate whatever transaction we want. 
+ * + * We still hold IOLOCK_EXCL on the inode being repaired, which + * prevents anyone from accessing the damaged directory data while we + * repair it. + */ + error = xrep_trans_commit(rd->sc); + if (error) + return error; + xchk_iunlock(rd->sc, XFS_ILOCK_EXCL); + + /* + * Take the IOLOCK of the temporary file while we modify dirents. This + * isn't strictly required because the temporary file is never revealed + * to userspace, but we follow the same locking rules. We still hold + * sc->ip's IOLOCK. + */ + error = xrep_tempfile_iolock_polled(rd->sc); + if (error) + return error; + + /* Write to the tempdir all the updates that we've stashed. */ + error = xrep_dir_replay_updates(rd); + xrep_tempfile_iounlock(rd->sc); + if (error) + return error; + + /* + * Recreate the salvage transaction and relock the dir we're salvaging. + */ + error = xchk_trans_alloc(rd->sc, 0); + if (error) + return error; + xchk_ilock(rd->sc, XFS_ILOCK_EXCL); + return 0; +} + +/* Decide if we've stashed too much dirent data in memory. */ +static inline bool +xrep_dir_want_flush_stashed( + struct xrep_dir *rd) +{ + unsigned long long bytes; + + bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names); + return bytes > XREP_DIR_MAX_STASH_BYTES; +} + +/* Extract as many directory entries as we can. */ +STATIC int +xrep_dir_recover( + struct xrep_dir *rd) +{ + struct xfs_bmbt_irec got; + struct xfs_scrub *sc = rd->sc; + struct xfs_da_geometry *geo = sc->mp->m_dir_geo; + xfs_fileoff_t offset; + xfs_dablk_t dabno; + __be32 magic_guess; + int nmap; + int error; + + xrep_dir_guess_format(rd, &magic_guess); + + /* Iterate each directory data block in the data fork. 
*/ + for (offset = 0; + offset < geo->leafblk; + offset = got.br_startoff + got.br_blockcount) { + nmap = 1; + error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset, + &got, &nmap, 0); + if (error) + return error; + if (nmap != 1) + return -EFSCORRUPTED; + if (!xfs_bmap_is_written_extent(&got)) + continue; + + for (dabno = round_up(got.br_startoff, geo->fsbcount); + dabno < got.br_startoff + got.br_blockcount; + dabno += geo->fsbcount) { + if (xchk_should_terminate(rd->sc, &error)) + return error; + + error = xrep_dir_recover_dirblock(rd, + magic_guess, dabno); + if (error) + return error; + + /* Flush dirents to constrain memory usage. */ + if (xrep_dir_want_flush_stashed(rd)) { + error = xrep_dir_flush_stashed(rd); + if (error) + return error; + } + } + } + + return 0; +} + +/* + * Find all the directory entries for this inode by scraping them out of the + * directory leaf blocks by hand, and flushing them into the temp dir. + */ +STATIC int +xrep_dir_find_entries( + struct xrep_dir *rd) +{ + struct xfs_inode *dp = rd->sc->ip; + int error; + + /* + * Salvage directory entries from the old directory, and write them to + * the temporary directory. + */ + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + error = xrep_dir_recover_sf(rd); + } else { + error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK); + if (error) + return error; + + error = xrep_dir_recover(rd); + } + if (error) + return error; + + return xrep_dir_flush_stashed(rd); +} + +/* Scan all files in the filesystem for dirents. */ +STATIC int +xrep_dir_salvage_entries( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + int error; + + /* + * Drop the ILOCK on this directory so that we can scan for this + * directory's parent. Figure out who is going to be the parent of + * this directory, then retake the ILOCK so that we can salvage + * directory entries. 
+ */ + xchk_iunlock(sc, XFS_ILOCK_EXCL); + error = xrep_dir_find_parent(rd); + xchk_ilock(sc, XFS_ILOCK_EXCL); + if (error) + return error; + + /* + * Collect directory entries by parsing raw leaf blocks to salvage + * whatever we can. When we're done, free the staging memory before + * exchanging the directories to reduce memory usage. + */ + error = xrep_dir_find_entries(rd); + if (error) + return error; + + /* + * Cancel the repair transaction and drop the ILOCK so that we can + * (later) use the atomic mapping exchange functions to compute the + * correct block reservations and re-lock the inodes. + * + * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory + * modifications, but there's nothing to prevent userspace from reading + * the directory until we're ready for the exchange operation. Reads + * will return -EIO without shutting down the fs, so we're ok with + * that. + * + * The VFS can change dotdot on us, but the findparent scan will keep + * our incore parent inode up to date. See the note on locking issues + * for more details. + */ + error = xrep_trans_commit(sc); + if (error) + return error; + + xchk_iunlock(sc, XFS_ILOCK_EXCL); + return 0; +} + + +/* + * Examine a parent pointer of a file. If it leads us back to the directory + * that we're rebuilding, create an incore dirent from the parent pointer and + * stash it. + */ +STATIC int +xrep_dir_scan_pptr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + .type = xfs_mode_to_ftype(VFS_I(ip)->i_mode), + }; + xfs_ino_t parent_ino; + uint32_t parent_gen; + struct xrep_dir *rd = priv; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + /* + * Ignore parent pointers that point back to a different dir, list the + * wrong generation number, or are invalid. 
+ */ + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, &parent_ino, &parent_gen); + if (error) + return error; + + if (parent_ino != sc->ip->i_ino || + parent_gen != VFS_I(sc->ip)->i_generation) + return 0; + + mutex_lock(&rd->pscan.lock); + error = xrep_dir_stash_createname(rd, &xname, ip->i_ino); + mutex_unlock(&rd->pscan.lock); + return error; +} + +/* + * If this child dirent points to the directory being repaired, remember that + * fact so that we can reset the dotdot entry if necessary. + */ +STATIC int +xrep_dir_scan_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xrep_dir *rd = priv; + + /* Dirent doesn't point to this directory. */ + if (ino != rd->sc->ip->i_ino) + return 0; + + /* Ignore garbage inum. */ + if (!xfs_verify_dir_ino(rd->sc->mp, ino)) + return 0; + + /* No weird looking names. */ + if (name->len >= MAXNAMELEN || name->len <= 0) + return 0; + + /* Don't pick up dot or dotdot entries; we only want child dirents. */ + if (xfs_dir2_samename(name, &xfs_name_dotdot) || + xfs_dir2_samename(name, &xfs_name_dot)) + return 0; + + trace_xrep_dir_stash_createname(sc->tempip, &xfs_name_dotdot, + dp->i_ino); + + xrep_findparent_scan_found(&rd->pscan, dp->i_ino); + return 0; +} + +/* + * Decide if we want to look for child dirents or parent pointers in this file. + * Skip the dir being repaired and any files being used to stage repairs. + */ +static inline bool +xrep_dir_want_scan( + struct xrep_dir *rd, + const struct xfs_inode *ip) +{ + return ip != rd->sc->ip && !xrep_is_tempfile(ip); +} + +/* + * Take ILOCK on a file that we want to scan. + * + * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt or + * has an unloaded attr bmbt. Otherwise, take ILOCK_SHARED. 
+ */ +static inline unsigned int +xrep_dir_scan_ilock( + struct xrep_dir *rd, + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + /* Need to take the shared ILOCK to advance the iscan cursor. */ + if (!xrep_dir_want_scan(rd, ip)) + goto lock; + + if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) { + lock_mode = XFS_ILOCK_EXCL; + goto lock; + } + + if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af)) + lock_mode = XFS_ILOCK_EXCL; + +lock: + xfs_ilock(ip, lock_mode); + return lock_mode; +} + +/* + * Scan this file for relevant child dirents or parent pointers that point to + * the directory we're rebuilding. + * + * The file is ILOCKed for the duration of the scan. Every exit path, error + * or not, marks the inode as visited in the iscan and drops the ILOCK. + * Returns -EBUSY if the extended attributes or the directory look zapped. + */ +STATIC int +xrep_dir_scan_file( + struct xrep_dir *rd, + struct xfs_inode *ip) +{ + unsigned int lock_mode; + int error = 0; + + lock_mode = xrep_dir_scan_ilock(rd, ip); + + if (!xrep_dir_want_scan(rd, ip)) + goto scan_done; + + /* + * If the extended attributes look as though they have been zapped by + * the inode record repair code, we cannot scan for parent pointers. + */ + if (xchk_pptr_looks_zapped(ip)) { + error = -EBUSY; + goto scan_done; + } + + error = xchk_xattr_walk(rd->sc, ip, xrep_dir_scan_pptr, NULL, rd); + if (error) + goto scan_done; + + if (S_ISDIR(VFS_I(ip)->i_mode)) { + /* + * If the directory looks as though it has been zapped by the + * inode record repair code, we cannot scan for child dirents. + */ + if (xchk_dir_looks_zapped(ip)) { + error = -EBUSY; + goto scan_done; + } + + error = xchk_dir_walk(rd->sc, ip, xrep_dir_scan_dirent, rd); + if (error) + goto scan_done; + } + +scan_done: + xchk_iscan_mark_visited(&rd->pscan.iscan, ip); + xfs_iunlock(ip, lock_mode); + return error; +} + +/* + * Scan all files in the filesystem for parent pointers that we can turn into + * replacement dirents, and a dirent that we can use to set the dotdot pointer. 
+ */ +STATIC int +xrep_dir_scan_dirtree( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + struct xfs_inode *ip; + int error; + + /* Roots of directory trees are their own parents. */ + if (sc->ip == sc->mp->m_rootip) + xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino); + + /* + * Filesystem scans are time consuming. Drop the directory ILOCK and + * all other resources for the duration of the scan and hope for the + * best. The live update hooks will keep our scan information up to + * date even though we've dropped the locks. + */ + xchk_trans_cancel(sc); + if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) + xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED | + XFS_ILOCK_EXCL)); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) { + bool flush; + + error = xrep_dir_scan_file(rd, ip); + xchk_irele(sc, ip); + if (error) + break; + + /* Flush stashed dirent updates to constrain memory usage. */ + mutex_lock(&rd->pscan.lock); + flush = xrep_dir_want_flush_stashed(rd); + mutex_unlock(&rd->pscan.lock); + if (flush) { + xchk_trans_cancel(sc); + + error = xrep_tempfile_iolock_polled(sc); + if (error) + break; + + error = xrep_dir_replay_updates(rd); + xrep_tempfile_iounlock(sc); + if (error) + break; + + error = xchk_trans_alloc_empty(sc); + if (error) + break; + } + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&rd->pscan.iscan); + if (error) { + /* + * If we couldn't grab an inode that was busy with a state + * change, change the error code so that we exit to userspace + * as quickly as possible. + */ + if (error == -EBUSY) + return -ECANCELED; + return error; + } + + /* + * Cancel the empty transaction so that we can (later) use the atomic + * file mapping exchange functions to lock files and commit the new + * directory. 
+ */ + xchk_trans_cancel(rd->sc); + return 0; +} + +/* + * Capture dirent updates being made by other threads which are relevant to the + * directory being repaired. + */ +STATIC int +xrep_dir_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xrep_dir *rd; + struct xfs_scrub *sc; + int error = 0; + + rd = container_of(nb, struct xrep_dir, pscan.dhook.dirent_hook.nb); + sc = rd->sc; + + /* + * This thread updated a child dirent in the directory that we're + * rebuilding. Stash the update for replay against the temporary + * directory. + */ + if (p->dp->i_ino == sc->ip->i_ino && + xchk_iscan_want_live_update(&rd->pscan.iscan, p->ip->i_ino)) { + mutex_lock(&rd->pscan.lock); + if (p->delta > 0) + error = xrep_dir_stash_createname(rd, p->name, + p->ip->i_ino); + else + error = xrep_dir_stash_removename(rd, p->name, + p->ip->i_ino); + mutex_unlock(&rd->pscan.lock); + if (error) + goto out_abort; + } + + /* + * This thread updated another directory's child dirent that points to + * the directory that we're rebuilding, so remember the new dotdot + * target. + */ + if (p->ip->i_ino == sc->ip->i_ino && + xchk_iscan_want_live_update(&rd->pscan.iscan, p->dp->i_ino)) { + if (p->delta > 0) { + trace_xrep_dir_stash_createname(sc->tempip, + &xfs_name_dotdot, + p->dp->i_ino); + + xrep_findparent_scan_found(&rd->pscan, p->dp->i_ino); + } else { + trace_xrep_dir_stash_removename(sc->tempip, + &xfs_name_dotdot, + rd->pscan.parent_ino); + + xrep_findparent_scan_found(&rd->pscan, NULLFSINO); + } + } + + return NOTIFY_DONE; +out_abort: + xchk_iscan_abort(&rd->pscan.iscan); + return NOTIFY_DONE; +} + +/* + * Free all the directory blocks and reset the data fork. The caller must + * join the inode to the transaction. This function returns with the inode + * joined to a clean scrub transaction. 
+ */ +STATIC int +xrep_dir_reset_fork( + struct xrep_dir *rd, + xfs_ino_t parent_ino) +{ + struct xfs_scrub *sc = rd->sc; + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK); + int error; + + /* Unmap all the directory buffers. */ + if (xfs_ifork_has_extents(ifp)) { + error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); + if (error) + return error; + } + + trace_xrep_dir_reset_fork(sc->tempip, parent_ino); + + /* Reset the data fork to an empty data fork. */ + xfs_idestroy_fork(ifp); + ifp->if_bytes = 0; + sc->tempip->i_disk_size = 0; + + /* Reinitialize the short form directory. */ + xrep_dir_init_args(rd, sc->tempip, NULL); + return xfs_dir2_sf_create(&rd->args, parent_ino); +} + +/* + * Prepare both inodes' directory forks for exchanging mappings. Promote the + * tempfile from short format to leaf format, and if the file being repaired + * has a short format data fork, turn it into an empty extent list. + */ +STATIC int +xrep_dir_swap_prep( + struct xfs_scrub *sc, + bool temp_local, + bool ip_local) +{ + int error; + + /* + * If the tempfile's directory is in shortform format, convert that to + * a single leaf extent so that we can use the atomic mapping exchange. + */ + if (temp_local) { + struct xfs_da_args args = { + .dp = sc->tempip, + .geo = sc->mp->m_dir_geo, + .whichfork = XFS_DATA_FORK, + .trans = sc->tp, + .total = 1, + .owner = sc->ip->i_ino, + }; + + error = xfs_dir2_sf_to_block(&args); + if (error) + return error; + + /* + * Roll the deferred log items to get us back to a clean + * transaction. + */ + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + } + + /* + * If the file being repaired had a shortform data fork, convert that + * to an empty extent list in preparation for the atomic mapping + * exchange. 
+ */ + if (ip_local) { + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + xfs_idestroy_fork(ifp); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; + ifp->if_nextents = 0; + ifp->if_bytes = 0; + ifp->if_data = NULL; + ifp->if_height = 0; + + xfs_trans_log_inode(sc->tp, sc->ip, + XFS_ILOG_CORE | XFS_ILOG_DDATA); + } + + return 0; +} + +/* + * Replace the inode number of a directory entry. + */ +static int +xrep_dir_replace( + struct xrep_dir *rd, + struct xfs_inode *dp, + const struct xfs_name *name, + xfs_ino_t inum, + xfs_extlen_t total) +{ + struct xfs_scrub *sc = rd->sc; + int error; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + + error = xfs_dir_ino_validate(sc->mp, inum); + if (error) + return error; + + xrep_dir_init_args(rd, dp, name); + rd->args.inumber = inum; + rd->args.total = total; + return xfs_dir_replace_args(&rd->args); +} + +/* + * Reset the link count of this directory and adjust the unlinked list pointers + * as needed. + */ +STATIC int +xrep_dir_set_nlink( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + struct xfs_inode *dp = sc->ip; + struct xfs_perag *pag; + unsigned int new_nlink = min_t(unsigned long long, + rd->subdirs + 2, + XFS_NLINK_PINNED); + int error; + + /* + * The directory is not on the incore unlinked list, which means that + * it needs to be reachable via the directory tree. Update the nlink + * with our observed link count. If the directory has no parent, it + * will be moved to the orphanage. + */ + if (!xfs_inode_on_unlinked_list(dp)) + goto reset_nlink; + + /* + * The directory is on the unlinked list and we did not find any + * dirents. Set the link count to zero and let the directory + * inactivate when the last reference drops. + */ + if (rd->dirents == 0) { + rd->needs_adoption = false; + new_nlink = 0; + goto reset_nlink; + } + + /* + * The directory is on the unlinked list and we found dirents. This + * directory needs to be reachable via the directory tree. 
Remove the + * dir from the unlinked list and update nlink with the observed link + * count. If the directory has no parent, it will be moved to the + * orphanage. + */ + pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino)); + if (!pag) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xfs_iunlink_remove(sc->tp, pag, dp); + xfs_perag_put(pag); + if (error) + return error; + +reset_nlink: + if (VFS_I(dp)->i_nlink != new_nlink) + set_nlink(VFS_I(dp), new_nlink); + return 0; +} + +/* + * Finish replaying stashed dirent updates, allocate a transaction for + * exchanging data fork mappings, and take the ILOCKs of both directories + * before we commit the new directory structure. + */ +STATIC int +xrep_dir_finalize_tempdir( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + int error; + + if (!xfs_has_parent(sc->mp)) + return xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx); + + /* + * Repair relies on the ILOCK to quiesce all possible dirent updates. + * Replay all queued dirent updates into the tempdir before exchanging + * the contents, even if that means dropping the ILOCKs and the + * transaction. + */ + do { + error = xrep_dir_replay_updates(rd); + if (error) + return error; + + error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx); + if (error) + return error; + + if (xfarray_length(rd->dir_entries) == 0) + break; + + xchk_trans_cancel(sc); + xrep_tempfile_iunlock_both(sc); + } while (!xchk_should_terminate(sc, &error)); + return error; +} + +/* Exchange the temporary directory's data fork with the one being repaired. */ +STATIC int +xrep_dir_swap( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + bool ip_local, temp_local; + int error = 0; + + /* + * If we never found the parent for this directory, temporarily assign + * the root dir as the parent; we'll move this to the orphanage after + * exchanging the dir contents. 
We hold the ILOCK of the dir being + * repaired, so we're not worried about racy updates of dotdot. + */ + ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); + if (rd->pscan.parent_ino == NULLFSINO) { + rd->needs_adoption = true; + rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino; + } + + /* + * Reset the temporary directory's '..' entry to point to the parent + * that we found. The temporary directory was created with the root + * directory as the parent, so we can skip this if repairing a + * subdirectory of the root. + * + * It's also possible that this replacement could expand a shortform + * tempdir into block format. + */ + if (rd->pscan.parent_ino != sc->mp->m_rootip->i_ino) { + error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot, + rd->pscan.parent_ino, rd->tx.req.resblks); + if (error) + return error; + } + + /* + * Changing the dot and dotdot entries could have changed the shape of + * the directory, so we recompute these. + */ + ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL; + temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL; + + /* + * If both files have a local format data fork and the rebuilt + * directory data would fit in the repaired file's data fork, copy + * the contents from the tempfile and update the directory link count. + * We're done now; no mapping exchange is performed on this path. + */ + if (ip_local && temp_local && + sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) { + xrep_tempfile_copyout_local(sc, XFS_DATA_FORK); + return xrep_dir_set_nlink(rd); + } + + /* + * Clean the transaction before we start working on exchanging + * directory contents. + */ + error = xrep_tempfile_roll_trans(rd->sc); + if (error) + return error; + + /* Otherwise, make sure both data forks are in block-mapping mode. */ + error = xrep_dir_swap_prep(sc, temp_local, ip_local); + if (error) + return error; + + /* + * Set nlink of the directory in the same transaction sequence that + * (atomically) commits the new directory data. 
+ */ + error = xrep_dir_set_nlink(rd); + if (error) + return error; + + return xrep_tempexch_contents(sc, &rd->tx); +} + +/* + * Exchange the new directory contents (which we created in the tempfile) with + * the directory being repaired. + */ +STATIC int +xrep_dir_rebuild_tree( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + int error; + + trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino); + + /* + * Take the IOLOCK on the temporary file so that we can run dir + * operations with the same locks held as we would for a normal file. + * We still hold sc->ip's IOLOCK. + */ + error = xrep_tempfile_iolock_polled(rd->sc); + if (error) + return error; + + /* + * Allocate transaction, lock inodes, and make sure that we've replayed + * all the stashed dirent updates to the tempdir. After this point, + * we're ready to exchange data fork mappings. + */ + error = xrep_dir_finalize_tempdir(rd); + if (error) + return error; + + if (xchk_iscan_aborted(&rd->pscan.iscan)) + return -ECANCELED; + + /* + * Exchange the tempdir's data fork with the file being repaired. This + * recreates the transaction and re-takes the ILOCK in the scrub + * context. + */ + error = xrep_dir_swap(rd); + if (error) + return error; + + /* + * Release the old directory blocks and reset the data fork of the temp + * directory to an empty shortform directory because inactivation does + * nothing for directories. + */ + error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino); + if (error) + return error; + + /* + * Roll to get a transaction without any inodes joined to it. Then we + * can drop the tempfile's ILOCK and IOLOCK before doing more work on + * the scrub target directory. + */ + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + + xrep_tempfile_iunlock(sc); + xrep_tempfile_iounlock(sc); + return 0; +} + +/* Set up the filesystem scan so we can regenerate directory entries. 
*/ +STATIC int +xrep_dir_setup_scan( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + char *descr; + int error; + + /* Set up some staging memory for salvaging dirents. */ + descr = xchk_xfile_ino_descr(sc, "directory entries"); + error = xfarray_create(descr, 0, sizeof(struct xrep_dirent), + &rd->dir_entries); + kfree(descr); + if (error) + return error; + + descr = xchk_xfile_ino_descr(sc, "directory entry names"); + error = xfblob_create(descr, &rd->dir_names); + kfree(descr); + if (error) + goto out_xfarray; + + if (xfs_has_parent(sc->mp)) + error = __xrep_findparent_scan_start(sc, &rd->pscan, + xrep_dir_live_update); + else + error = xrep_findparent_scan_start(sc, &rd->pscan); + if (error) + goto out_xfblob; + + return 0; + +out_xfblob: + xfblob_destroy(rd->dir_names); + rd->dir_names = NULL; +out_xfarray: + xfarray_destroy(rd->dir_entries); + rd->dir_entries = NULL; + return error; +} + +/* + * Move the current file to the orphanage. + * + * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon + * successful return, the scrub transaction will have enough extra reservation + * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the + * orphanage; and both inodes will be ijoined. + */ +STATIC int +xrep_dir_move_to_orphanage( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + xfs_ino_t orig_parent, new_parent; + int error; + + /* + * We are about to drop the ILOCK on sc->ip to lock the orphanage and + * prepare for the adoption. Therefore, look up the old dotdot entry + * for sc->ip so that we can compare it after we re-lock sc->ip. + */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent); + if (error) + return error; + + /* + * Drop the ILOCK on the scrub target and commit the transaction. + * Adoption computes its own resource requirements and gathers the + * necessary components. 
+ */ + error = xrep_trans_commit(sc); + if (error) + return error; + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* If we can take the orphanage's iolock then we're ready to move. */ + if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) { + xchk_iunlock(sc, sc->ilock_flags); + error = xrep_orphanage_iolock_two(sc); + if (error) + return error; + } + + /* Grab transaction and ILOCK the two files. */ + error = xrep_adoption_trans_alloc(sc, &rd->adoption); + if (error) + return error; + + error = xrep_adoption_compute_name(&rd->adoption, &rd->xname); + if (error) + return error; + + /* + * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot + * entry again. If the parent changed or the child was unlinked while + * the child directory was unlocked, we don't need to move the child to + * the orphanage after all. + */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent); + if (error) + return error; + + /* + * Attach to the orphanage if we still have a linked directory and it + * hasn't been moved. + */ + if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) { + error = xrep_adoption_move(&rd->adoption); + if (error) + return error; + } + + /* + * Launder the scrub transaction so we can drop the orphanage ILOCK + * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK. + */ + error = xrep_adoption_trans_roll(&rd->adoption); + if (error) + return error; + + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + return 0; +} + +/* + * Repair the directory metadata. + * + * XXX: Directory entry buffers can be multiple fsblocks in size. The buffer + * cache in XFS can't handle aliased multiblock buffers, so this might + * misbehave if the directory blocks are crosslinked with other filesystem + * metadata. + * + * XXX: Is it necessary to check the dcache for this directory to make sure + * that we always recreate every cached entry? 
+ */ +int +xrep_directory( + struct xfs_scrub *sc) +{ + struct xrep_dir *rd = sc->buf; + int error; + + /* The rmapbt is required to reap the old data fork. */ + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + /* We require atomic file exchange range to rebuild anything. */ + if (!xfs_has_exchange_range(sc->mp)) + return -EOPNOTSUPP; + + error = xrep_dir_setup_scan(rd); + if (error) + return error; + + if (xfs_has_parent(sc->mp)) + error = xrep_dir_scan_dirtree(rd); + else + error = xrep_dir_salvage_entries(rd); + if (error) + goto out_teardown; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto out_teardown; + + error = xrep_dir_rebuild_tree(rd); + if (error) + goto out_teardown; + + if (rd->needs_adoption) { + if (!xrep_orphanage_can_adopt(rd->sc)) + error = -EFSCORRUPTED; + else + error = xrep_dir_move_to_orphanage(rd); + if (error) + goto out_teardown; + } + + /* Success falls through to the same teardown as the error paths. */ +out_teardown: + xrep_dir_teardown(sc); + return error; +} diff --git a/fs/xfs/scrub/dirtree.c b/fs/xfs/scrub/dirtree.c new file mode 100644 index 000000000000..bde58fb561ea --- /dev/null +++ b/fs/xfs/scrub/dirtree.c @@ -0,0 +1,985 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J.
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_attr.h" +#include "xfs_parent.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/bitmap.h" +#include "scrub/ino_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/listxattr.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/orphanage.h" +#include "scrub/dirtree.h" + +/* + * Directory Tree Structure Validation + * =================================== + * + * Validating the tree qualities of the directory tree structure can be + * difficult. If the tree is frozen, running a depth (or breadth) first search + * and marking a bitmap suffices to determine if there is a cycle. XORing the + * mark bitmap with the inode bitmap afterwards tells us if there are + * disconnected cycles. If the tree is not frozen, directory updates can move + * subtrees across the scanner wavefront, which complicates the design greatly. + * + * Directory parent pointers change that by enabling an incremental approach to + * validation of the tree structure. Instead of using one thread to scan the + * entire filesystem, we instead can have multiple threads walking individual + * subdirectories upwards to the root. In a perfect world, the IOLOCK would + * suffice to stabilize two directories in a parent -> child relationship. + * Unfortunately, the VFS does not take the IOLOCK when moving a child + * subdirectory, so we instead synchronize on ILOCK and use dirent update hooks + * to detect a race. If a race occurs in a path, we restart the scan. 
+ * + * If the walk terminates without reaching the root, we know the path is + * disconnected and ought to be attached to the lost and found. If on the walk + * we find the same subdir that we're scanning, we know this is a cycle and + * should delete an incoming edge. If we find multiple paths to the root, we + * know to delete an incoming edge. + * + * There are two big hitches with this approach: first, all file link counts + * must be correct to prevent other writers from doing the wrong thing with the + * directory tree structure. Second, because we're walking upwards in a tree + * of arbitrary depth, we cannot hold all the ILOCKs. Instead, we will use a + * directory update hook to invalidate the scan results if one of the paths + * we've scanned has changed. + */ + +/* Clean up the dirtree checking resources. */ +STATIC void +xchk_dirtree_buf_cleanup( + void *buf) +{ + struct xchk_dirtree *dl = buf; + struct xchk_dirpath *path, *n; + + /* scan_ino stays NULLFSINO until xchk_dirtree() runs, which is where the dirent hook is added. */ + if (dl->scan_ino != NULLFSINO) + xfs_dir_hook_del(dl->sc->mp, &dl->dhook); + + xchk_dirtree_for_each_path_safe(dl, path, n) { + list_del_init(&path->list); + xino_bitmap_destroy(&path->seen_inodes); + kfree(path); + } + + xfblob_destroy(dl->path_names); + xfarray_destroy(dl->path_steps); + mutex_destroy(&dl->lock); +} + +/* Set us up to look for directory loops.
*/ +int +xchk_setup_dirtree( + struct xfs_scrub *sc) +{ + struct xchk_dirtree *dl; + char *descr; + int error; + + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); + + if (xchk_could_repair(sc)) { + error = xrep_setup_dirtree(sc); + if (error) + return error; + } + + dl = kvzalloc(sizeof(struct xchk_dirtree), XCHK_GFP_FLAGS); + if (!dl) + return -ENOMEM; + dl->sc = sc; + dl->xname.name = dl->namebuf; + dl->hook_xname.name = dl->hook_namebuf; + INIT_LIST_HEAD(&dl->path_list); + dl->root_ino = NULLFSINO; + dl->scan_ino = NULLFSINO; + dl->parent_ino = NULLFSINO; + + mutex_init(&dl->lock); + + descr = xchk_xfile_ino_descr(sc, "dirtree path steps"); + error = xfarray_create(descr, 0, sizeof(struct xchk_dirpath_step), + &dl->path_steps); + kfree(descr); + if (error) + goto out_dl; + + descr = xchk_xfile_ino_descr(sc, "dirtree path names"); + error = xfblob_create(descr, &dl->path_names); + kfree(descr); + if (error) + goto out_steps; + + error = xchk_setup_inode_contents(sc, 0); + if (error) + goto out_names; + + sc->buf = dl; + sc->buf_cleanup = xchk_dirtree_buf_cleanup; + return 0; + +out_names: + xfblob_destroy(dl->path_names); +out_steps: + xfarray_destroy(dl->path_steps); +out_dl: + mutex_destroy(&dl->lock); + kvfree(dl); + return error; +} + +/* + * Add the parent pointer described by @name and @pptr to the given path as a + * new step, and mark @ip as seen by this path. Returns -ELNRNG if the path is + * too deep. + */ +int +xchk_dirpath_append( + struct xchk_dirtree *dl, + struct xfs_inode *ip, + struct xchk_dirpath *path, + const struct xfs_name *name, + const struct xfs_parent_rec *pptr) +{ + struct xchk_dirpath_step step = { + .pptr_rec = *pptr, /* struct copy */ + .name_len = name->len, + }; + int error; + + /* + * If this path is more than 2 billion steps long, this directory tree + * is too far gone to fix.
+ */ + if (path->nr_steps >= XFS_MAXLINK) + return -ELNRNG; + + error = xfblob_storename(dl->path_names, &step.name_cookie, name); + if (error) + return error; + + error = xino_bitmap_set(&path->seen_inodes, ip->i_ino); + if (error) + return error; + + error = xfarray_append(dl->path_steps, &step); + if (error) + return error; + + path->nr_steps++; + return 0; +} + +/* + * Create an xchk_path for each parent pointer of the directory that we're + * scanning. For each path created, we will eventually try to walk towards the + * root with the goal of deleting all parents except for one that leads to the + * root. + * + * Returns -EFSCORRUPTED to signal that the inode being scanned has a corrupt + * parent pointer and hence there's no point in continuing; or -ENOSR if there + * are too many parent pointers for this directory. + */ +STATIC int +xchk_dirtree_create_path( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + }; + struct xchk_dirtree *dl = priv; + struct xchk_dirpath *path; + const struct xfs_parent_rec *rec = value; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, NULL, NULL); + if (error) + return error; + + /* + * If there are more than 2 billion actual parent pointers for this + * subdirectory, this fs is too far gone to fix. + */ + if (dl->nr_paths >= XFS_MAXLINK) + return -ENOSR; + + trace_xchk_dirtree_create_path(sc, ip, dl->nr_paths, &xname, rec); + + /* + * Create a new xchk_path structure to remember this parent pointer + * and record the first name step. 
+ */ + path = kmalloc(sizeof(struct xchk_dirpath), XCHK_GFP_FLAGS); + if (!path) + return -ENOMEM; + + INIT_LIST_HEAD(&path->list); + xino_bitmap_init(&path->seen_inodes); + path->nr_steps = 0; + path->outcome = XCHK_DIRPATH_SCANNING; + + error = xchk_dirpath_append(dl, sc->ip, path, &xname, rec); + if (error) + goto out_path; + + path->first_step = xfarray_length(dl->path_steps) - 1; + path->second_step = XFARRAY_NULLIDX; + path->path_nr = dl->nr_paths; + + list_add_tail(&path->list, &dl->path_list); + dl->nr_paths++; + return 0; +out_path: + kfree(path); + return error; +} + +/* + * Validate that the first step of this path still has a corresponding + * parent pointer in @sc->ip. We probably dropped @sc->ip's ILOCK while + * walking towards the roots, which is why this is necessary. + * + * This function has a side effect of loading the first parent pointer of this + * path into the parent pointer scratch pad. This prepares us to walk up the + * directory tree towards the root. Returns -ESTALE if the scan data is now + * out of date. + */ +STATIC int +xchk_dirpath_revalidate( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xfs_scrub *sc = dl->sc; + int error; + + /* + * Look up the parent pointer that corresponds to the start of this + * path. If the parent pointer has disappeared on us, dump all the + * scan results and try again. + */ + error = xfs_parent_lookup(sc->tp, sc->ip, &dl->xname, &dl->pptr_rec, + &dl->pptr_args); + if (error == -ENOATTR) { + trace_xchk_dirpath_disappeared(dl->sc, sc->ip, path->path_nr, + path->first_step, &dl->xname, &dl->pptr_rec); + dl->stale = true; + return -ESTALE; + } + + return error; +} + +/* + * Walk the parent pointers of a directory at the end of a path and record + * the parent that we find in @dl->xname/pptr_rec. 
+ */ +STATIC int +xchk_dirpath_find_next_step( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xchk_dirtree *dl = priv; + const struct xfs_parent_rec *rec = value; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, NULL, NULL); + if (error) + return error; + + /* + * If we've already set @dl->pptr_rec, then this directory has multiple + * parents. Signal this back to the caller via -EMLINK. + */ + if (dl->parents_found > 0) + return -EMLINK; + + dl->parents_found++; + memcpy(dl->namebuf, name, namelen); + dl->xname.len = namelen; + dl->pptr_rec = *rec; /* struct copy */ + return 0; +} + +/* Set and log the outcome of a path walk. */ +static inline void +xchk_dirpath_set_outcome( + struct xchk_dirtree *dl, + struct xchk_dirpath *path, + enum xchk_dirpath_outcome outcome) +{ + trace_xchk_dirpath_set_outcome(dl->sc, path->path_nr, path->nr_steps, + outcome); + + path->outcome = outcome; +} + +/* + * Scan the directory at the end of this path for its parent directory link. + * If we find one, extend the path. Returns -ESTALE if the scan data is out of + * date. Returns -EFSCORRUPTED if the parent pointer is bad; or -ELNRNG if + * the path got too deep. + */ +STATIC int +xchk_dirpath_step_up( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xfs_scrub *sc = dl->sc; + struct xfs_inode *dp; + xfs_ino_t parent_ino = be64_to_cpu(dl->pptr_rec.p_ino); + unsigned int lock_mode; + int error; + + /* Grab and lock the parent directory. */ + error = xchk_iget(sc, parent_ino, &dp); + if (error) + return error; + + lock_mode = xfs_ilock_attr_map_shared(dp); + mutex_lock(&dl->lock); + + if (dl->stale) { + error = -ESTALE; + goto out_scanlock; + } + + /* We've reached the root directory; the path is ok.
*/ + if (parent_ino == dl->root_ino) { + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_OK); + error = 0; + goto out_scanlock; + } + + /* + * The inode being scanned is its own distant ancestor! Get rid of + * this path. + */ + if (parent_ino == sc->ip->i_ino) { + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + error = 0; + goto out_scanlock; + } + + /* + * We've seen this inode before during the path walk. There's a loop + * above us in the directory tree. This probably means that we cannot + * continue, but let's keep walking paths to get a full picture. + */ + if (xino_bitmap_test(&path->seen_inodes, parent_ino)) { + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_LOOP); + error = 0; + goto out_scanlock; + } + + /* The handle encoded in the parent pointer must match. */ + if (VFS_I(dp)->i_generation != be32_to_cpu(dl->pptr_rec.p_gen)) { + trace_xchk_dirpath_badgen(dl->sc, dp, path->path_nr, + path->nr_steps, &dl->xname, &dl->pptr_rec); + error = -EFSCORRUPTED; + goto out_scanlock; + } + + /* Parent pointer must point up to a directory. */ + if (!S_ISDIR(VFS_I(dp)->i_mode)) { + trace_xchk_dirpath_nondir_parent(dl->sc, dp, path->path_nr, + path->nr_steps, &dl->xname, &dl->pptr_rec); + error = -EFSCORRUPTED; + goto out_scanlock; + } + + /* Parent cannot be an unlinked directory. */ + if (VFS_I(dp)->i_nlink == 0) { + trace_xchk_dirpath_unlinked_parent(dl->sc, dp, path->path_nr, + path->nr_steps, &dl->xname, &dl->pptr_rec); + error = -EFSCORRUPTED; + goto out_scanlock; + } + + /* + * If the extended attributes look as though they have been zapped by + * the inode record repair code, we cannot scan for parent pointers. + */ + if (xchk_pptr_looks_zapped(dp)) { + error = -EBUSY; + xchk_set_incomplete(sc); + goto out_scanlock; + } + + /* + * Walk the parent pointers of @dp to find the parent of this directory + * to find the next step in our walk.
If we find that @dp has exactly + * one parent, the parent pointer information will be stored in + * @dl->pptr_rec. This prepares us for the next step of the walk. + */ + mutex_unlock(&dl->lock); + dl->parents_found = 0; + error = xchk_xattr_walk(sc, dp, xchk_dirpath_find_next_step, NULL, dl); + mutex_lock(&dl->lock); + if (error == -EFSCORRUPTED || error == -EMLINK || + (!error && dl->parents_found == 0)) { + /* + * Further up the directory tree from @sc->ip, we found a + * corrupt parent pointer, multiple parent pointers while + * finding this directory's parent, or zero parents despite + * having a nonzero link count. Keep looking for other paths. + */ + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_CORRUPT); + error = 0; + goto out_scanlock; + } + if (error) + goto out_scanlock; + + if (dl->stale) { + error = -ESTALE; + goto out_scanlock; + } + + trace_xchk_dirpath_found_next_step(sc, dp, path->path_nr, + path->nr_steps, &dl->xname, &dl->pptr_rec); + + /* Append to the path steps */ + error = xchk_dirpath_append(dl, dp, path, &dl->xname, &dl->pptr_rec); + if (error) + goto out_scanlock; + + if (path->second_step == XFARRAY_NULLIDX) + path->second_step = xfarray_length(dl->path_steps) - 1; + + /* Common exit: drop dl->lock, then unlock and release the parent directory. */ +out_scanlock: + mutex_unlock(&dl->lock); + xfs_iunlock(dp, lock_mode); + xchk_irele(sc, dp); + return error; +} + +/* + * Walk the directory tree upwards towards what is hopefully the root + * directory, recording path steps as we go. The current path components are + * stored in dl->pptr_rec and dl->xname. + * + * Returns -ESTALE if the scan data are out of date. Returns -EFSCORRUPTED + * only if the direct parent pointer of @sc->ip associated with this path is + * corrupt. + */ +STATIC int +xchk_dirpath_walk_upwards( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xfs_scrub *sc = dl->sc; + int error; + + ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); + + /* Reload the start of this path and make sure it's still there.
*/ + error = xchk_dirpath_revalidate(dl, path); + if (error) + return error; + + trace_xchk_dirpath_walk_upwards(sc, sc->ip, path->path_nr, &dl->xname, + &dl->pptr_rec); + + /* + * The inode being scanned is its own direct ancestor! + * Get rid of this path. + */ + if (be64_to_cpu(dl->pptr_rec.p_ino) == sc->ip->i_ino) { + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + return 0; + } + + /* + * Drop ILOCK_EXCL on the inode being scanned. We still hold + * IOLOCK_EXCL on it, so it cannot move around or be renamed. + * + * Beyond this point we're walking up the directory tree, which means + * that we can acquire and drop the ILOCK on an alias of sc->ip. The + * ILOCK state is no longer tracked in the scrub context. Hence we + * must drop @sc->ip's ILOCK during the walk. + */ + /* Drop dl->lock as well; xchk_dirpath_step_up() takes it itself. */ + mutex_unlock(&dl->lock); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* + * Take the first step in the walk towards the root by checking the + * start of this path, which is a direct parent pointer of @sc->ip. + * If we see any kind of error here (including corruptions), the parent + * pointer of @sc->ip is corrupt. Stop the whole scan. + */ + error = xchk_dirpath_step_up(dl, path); + if (error) { + xchk_ilock(sc, XFS_ILOCK_EXCL); + mutex_lock(&dl->lock); + return error; + } + + /* + * Take steps upward from the second step in this path towards the + * root. If we hit corruption errors here, there's a problem + * *somewhere* in the path, but we don't need to stop scanning. + */ + while (!error && path->outcome == XCHK_DIRPATH_SCANNING) + error = xchk_dirpath_step_up(dl, path); + + /* Retake the locks we had, mark paths, etc. */ + xchk_ilock(sc, XFS_ILOCK_EXCL); + mutex_lock(&dl->lock); + if (error == -EFSCORRUPTED) { + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_CORRUPT); + error = 0; + } + if (!error && dl->stale) + return -ESTALE; + return error; +} + +/* + * Decide if this path step has been touched by this live update. Returns + * 1 for yes, 0 for no, or a negative errno.
+ */ +STATIC int +xchk_dirpath_step_is_stale( + struct xchk_dirtree *dl, + struct xchk_dirpath *path, + unsigned int step_nr, + xfarray_idx_t step_idx, + struct xfs_dir_update_params *p, + xfs_ino_t *cursor) +{ + struct xchk_dirpath_step step; + xfs_ino_t child_ino = *cursor; + int error; + + error = xfarray_load(dl->path_steps, step_idx, &step); + if (error) + return error; + /* Advance the cursor to this step's parent; it is the child for the next step. */ + *cursor = be64_to_cpu(step.pptr_rec.p_ino); + + /* + * If the parent and child being updated are not the ones mentioned in + * this path step, the scan data is still ok. + */ + if (p->ip->i_ino != child_ino || p->dp->i_ino != *cursor) + return 0; + + /* + * If the dirent name lengths or byte sequences are different, the scan + * data is still ok. + */ + if (p->name->len != step.name_len) + return 0; + + error = xfblob_loadname(dl->path_names, step.name_cookie, + &dl->hook_xname, step.name_len); + if (error) + return error; + + if (memcmp(dl->hook_xname.name, p->name->name, p->name->len) != 0) + return 0; + + /* + * If the update comes from the repair code itself, walk the state + * machine forward. + */ + if (p->ip->i_ino == dl->scan_ino && + path->outcome == XREP_DIRPATH_ADOPTING) { + xchk_dirpath_set_outcome(dl, path, XREP_DIRPATH_ADOPTED); + return 0; + } + + if (p->ip->i_ino == dl->scan_ino && + path->outcome == XREP_DIRPATH_DELETING) { + xchk_dirpath_set_outcome(dl, path, XREP_DIRPATH_DELETED); + return 0; + } + + /* Exact match, scan data is out of date. */ + trace_xchk_dirpath_changed(dl->sc, path->path_nr, step_nr, p->dp, + p->ip, p->name); + return 1; +} + +/* + * Decide if this path has been touched by this live update. Returns 1 for + * yes, 0 for no, or a negative errno.
+ */ +STATIC int +xchk_dirpath_is_stale( + struct xchk_dirtree *dl, + struct xchk_dirpath *path, + struct xfs_dir_update_params *p) +{ + xfs_ino_t cursor = dl->scan_ino; + xfarray_idx_t idx = path->first_step; + unsigned int i; + int ret; + + /* + * The child being updated has not been seen by this path at all; this + * path cannot be stale. + */ + if (!xino_bitmap_test(&path->seen_inodes, p->ip->i_ino)) + return 0; + + ret = xchk_dirpath_step_is_stale(dl, path, 0, idx, p, &cursor); + if (ret != 0) + return ret; + + for (i = 1, idx = path->second_step; i < path->nr_steps; i++, idx++) { + ret = xchk_dirpath_step_is_stale(dl, path, i, idx, p, &cursor); + if (ret != 0) + return ret; + } + + return 0; +} + +/* + * Decide if a directory update from the regular filesystem touches any of the + * paths we've scanned, and invalidate the scan data if true. + */ +STATIC int +xchk_dirtree_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xchk_dirtree *dl; + struct xchk_dirpath *path; + int ret; + + dl = container_of(nb, struct xchk_dirtree, dhook.dirent_hook.nb); + + trace_xchk_dirtree_live_update(dl->sc, p->dp, action, p->ip, p->delta, + p->name); + + mutex_lock(&dl->lock); + + if (dl->stale || dl->aborted) + goto out_unlock; + + xchk_dirtree_for_each_path(dl, path) { + ret = xchk_dirpath_is_stale(dl, path, p); + if (ret < 0) { + dl->aborted = true; + break; + } + if (ret == 1) { + dl->stale = true; + break; + } + } + + /* Results are reported via dl->stale and dl->aborted; the notifier return value is always NOTIFY_DONE. */ +out_unlock: + mutex_unlock(&dl->lock); + return NOTIFY_DONE; +} + +/* Delete all the collected path information.
*/ +STATIC void +xchk_dirtree_reset( + void *buf) +{ + struct xchk_dirtree *dl = buf; + struct xchk_dirpath *path, *n; + + ASSERT(dl->sc->ilock_flags & XFS_ILOCK_EXCL); + + xchk_dirtree_for_each_path_safe(dl, path, n) { + list_del_init(&path->list); + xino_bitmap_destroy(&path->seen_inodes); + kfree(path); + } + dl->nr_paths = 0; + + xfarray_truncate(dl->path_steps); + xfblob_truncate(dl->path_names); + + dl->stale = false; +} + +/* + * Load the name/pptr from the first step in this path into @dl->pptr_rec and + * @dl->xname. + */ +STATIC int +xchk_dirtree_load_path( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xchk_dirpath_step step; + int error; + + error = xfarray_load(dl->path_steps, path->first_step, &step); + if (error) + return error; + + error = xfblob_loadname(dl->path_names, step.name_cookie, &dl->xname, + step.name_len); + if (error) + return error; + + dl->pptr_rec = step.pptr_rec; /* struct copy */ + return 0; +} + +/* + * For each parent pointer of this subdir, trace a path upwards towards the + * root directory and record what we find. Returns 0 for success; + * -EFSCORRUPTED if walking the parent pointers of @sc->ip failed, -ELNRNG if a + * path was too deep; -ENOSR if there were too many parent pointers; or + * a negative errno. + */ +int +xchk_dirtree_find_paths_to_root( + struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + struct xchk_dirpath *path; + int error = 0; + + do { + if (xchk_should_terminate(sc, &error)) + return error; + + xchk_dirtree_reset(dl); + + /* + * If the extended attributes look as though they have been + * zapped by the inode record repair code, we cannot scan for + * parent pointers. + */ + if (xchk_pptr_looks_zapped(sc->ip)) { + xchk_set_incomplete(sc); + return -EBUSY; + } + + /* + * Create path walk contexts for each parent of the directory + * that is being scanned. Directories are supposed to have + * only one parent, but this is how we detect multiple parents.
+ */ + error = xchk_xattr_walk(sc, sc->ip, xchk_dirtree_create_path, + NULL, dl); + if (error) + return error; + + xchk_dirtree_for_each_path(dl, path) { + /* Load path components into dl->pptr_rec/xname */ + error = xchk_dirtree_load_path(dl, path); + if (error) + return error; + + /* + * Try to walk up each path to the root. This enables + * us to find directory loops in ancestors, and the + * like. + */ + error = xchk_dirpath_walk_upwards(dl, path); + if (error == -EFSCORRUPTED) { + /* + * A parent pointer of @sc->ip is bad, don't + * bother continuing. + */ + break; + } + if (error == -ESTALE) { + /* This had better be an invalidation. */ + ASSERT(dl->stale); + break; + } + if (error) + return error; + if (dl->aborted) + return 0; + } + } while (dl->stale); + + return error; +} + +/* + * Figure out what to do with the paths we tried to find. Do not call this + * if the scan results are stale. + */ +void +xchk_dirtree_evaluate( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + + ASSERT(!dl->stale); + + /* Scan the paths we have to decide what to do. */ + memset(oc, 0, sizeof(struct xchk_dirtree_outcomes)); + xchk_dirtree_for_each_path(dl, path) { + trace_xchk_dirpath_evaluate_path(dl->sc, path->path_nr, + path->nr_steps, path->outcome); + + switch (path->outcome) { + case XCHK_DIRPATH_SCANNING: + /* shouldn't get here */ + ASSERT(0); + break; + case XCHK_DIRPATH_DELETE: + /* This one is already going away. */ + oc->bad++; + break; + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + /* Couldn't find the end of this path. */ + oc->suspect++; + break; + case XCHK_DIRPATH_STALE: + /* shouldn't get here either */ + ASSERT(0); + break; + case XCHK_DIRPATH_OK: + /* This path got all the way to the root. */ + oc->good++; + break; + case XREP_DIRPATH_DELETING: + case XREP_DIRPATH_DELETED: + case XREP_DIRPATH_ADOPTING: + case XREP_DIRPATH_ADOPTED: + /* These should not be in progress!
*/ + ASSERT(0); + break; + } + } + + trace_xchk_dirtree_evaluate(dl, oc); +} + +/* Look for directory loops. */ +int +xchk_dirtree( + struct xfs_scrub *sc) +{ + struct xchk_dirtree_outcomes oc; + struct xchk_dirtree *dl = sc->buf; + int error; + + /* + * Nondirectories do not point downwards to other files, so they cannot + * cause a cycle in the directory tree. + */ + if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) + return -ENOENT; + + ASSERT(xfs_has_parent(sc->mp)); + + /* + * Find the root of the directory tree. Remember which directory to + * scan, because the hook doesn't detach until after sc->ip gets + * released during teardown. + */ + dl->root_ino = sc->mp->m_rootip->i_ino; + dl->scan_ino = sc->ip->i_ino; + + trace_xchk_dirtree_start(sc->ip, sc->sm, 0); + + /* + * Hook into the directory entry code so that we can capture updates to + * paths that we have already scanned. The scanner thread takes each + * directory's ILOCK, which means that any in-progress directory update + * will finish before we can scan the directory. + */ + ASSERT(sc->flags & XCHK_FSGATES_DIRENTS); + xfs_dir_hook_setup(&dl->dhook, xchk_dirtree_live_update); + error = xfs_dir_hook_add(sc->mp, &dl->dhook); + if (error) + goto out; + + /* xchk_dirtree_live_update() takes this same lock. */ + mutex_lock(&dl->lock); + + /* Trace each parent pointer's path to the root. */ + error = xchk_dirtree_find_paths_to_root(dl); + if (error == -EFSCORRUPTED || error == -ELNRNG || error == -ENOSR) { + /* + * Don't bother walking the paths if the xattr structure or the + * parent pointers are corrupt; this scan cannot be completed + * without full information. + */ + xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); + error = 0; + goto out_scanlock; + } + if (error == -EBUSY) { + /* + * We couldn't scan some directory's parent pointers because + * the attr fork looked like it had been zapped. The + * scan was marked incomplete, so no further error code + * is necessary.
+ */ + error = 0; + goto out_scanlock; + } + if (error) + goto out_scanlock; + if (dl->aborted) { + xchk_set_incomplete(sc); + goto out_scanlock; + } + + /* + * Assess what we found in our path evaluation. A parentless directory + * must not have any paths at all; any other directory must have + * exactly one good or suspect path and none marked for deletion. + */ + xchk_dirtree_evaluate(dl, &oc); + if (xchk_dirtree_parentless(dl)) { + if (oc.good || oc.bad || oc.suspect) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } else { + if (oc.bad || oc.good + oc.suspect != 1) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + if (oc.suspect) + xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); + } + +out_scanlock: + mutex_unlock(&dl->lock); +out: + trace_xchk_dirtree_done(sc->ip, sc->sm, error); + return error; +} diff --git a/fs/xfs/scrub/dirtree.h b/fs/xfs/scrub/dirtree.h new file mode 100644 index 000000000000..1e1686365c61 --- /dev/null +++ b/fs/xfs/scrub/dirtree.h @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_DIRTREE_H__ +#define __XFS_SCRUB_DIRTREE_H__ + +/* + * Each of these represents one parent pointer path step in a chain going + * up towards the directory tree root. These are stored inside an xfarray. + */ +struct xchk_dirpath_step { + /* Directory entry name associated with this parent link. */ + xfblob_cookie name_cookie; + unsigned int name_len; + + /* Handle of the parent directory.
*/ + struct xfs_parent_rec pptr_rec; +}; + +enum xchk_dirpath_outcome { + XCHK_DIRPATH_SCANNING = 0, /* still being put together */ + XCHK_DIRPATH_DELETE, /* delete this path */ + XCHK_DIRPATH_CORRUPT, /* corruption detected in path */ + XCHK_DIRPATH_LOOP, /* cycle detected further up */ + XCHK_DIRPATH_STALE, /* path is stale */ + XCHK_DIRPATH_OK, /* path reaches the root */ + + XREP_DIRPATH_DELETING, /* path is being deleted */ + XREP_DIRPATH_DELETED, /* path has been deleted */ + XREP_DIRPATH_ADOPTING, /* path is being adopted */ + XREP_DIRPATH_ADOPTED, /* path has been adopted */ +}; + +/* + * Each of these represents one parent pointer path out of the directory being + * scanned. These exist in-core, and hopefully there aren't more than a + * handful of them. + */ +struct xchk_dirpath { + struct list_head list; + + /* Index of the first step in this path. */ + xfarray_idx_t first_step; + + /* + * Index of the second step in this path, or XFARRAY_NULLIDX if the + * path has only one step so far. Later steps follow it consecutively + * in the xfarray. + */ + xfarray_idx_t second_step; + + /* Inodes seen while walking this path. */ + struct xino_bitmap seen_inodes; + + /* Number of steps in this path. */ + unsigned int nr_steps; + + /* Which path is this? */ + unsigned int path_nr; + + /* What did we conclude from following this path? */ + enum xchk_dirpath_outcome outcome; +}; + +struct xchk_dirtree_outcomes { + /* Number of XCHK_DIRPATH_DELETE */ + unsigned int bad; + + /* Number of XCHK_DIRPATH_CORRUPT or XCHK_DIRPATH_LOOP */ + unsigned int suspect; + + /* Number of XCHK_DIRPATH_OK */ + unsigned int good; + + /* Directory needs to be added to lost+found */ + bool needs_adoption; +}; + +struct xchk_dirtree { + struct xfs_scrub *sc; + + /* Root inode that we're looking for. */ + xfs_ino_t root_ino; + + /* + * This is the inode that we're scanning. The live update hook can + * continue to be called after xchk_teardown drops sc->ip but before + * it calls buf_cleanup, so we keep a copy.
+ */ + xfs_ino_t scan_ino; + + /* + * If we start deleting redundant paths to this subdirectory, this is + * the inode number of the surviving parent and the dotdot entry will + * be set to this value. If the value is NULLFSINO, then use @root_ino + * as a stand-in until the orphanage can adopt the subdirectory. + */ + xfs_ino_t parent_ino; + + /* Scratch buffer for scanning pptr xattrs */ + struct xfs_parent_rec pptr_rec; + struct xfs_da_args pptr_args; + + /* Name buffer */ + struct xfs_name xname; + char namebuf[MAXNAMELEN]; + + /* Information for reparenting this directory. */ + struct xrep_adoption adoption; + + /* + * Hook into directory updates so that we can receive live updates + * from other writer threads. + */ + struct xfs_dir_hook dhook; + + /* Parent pointer update arguments. */ + struct xfs_parent_args ppargs; + + /* lock for everything below here */ + struct mutex lock; + + /* buffer for the live update functions to use for dirent names */ + struct xfs_name hook_xname; + unsigned char hook_namebuf[MAXNAMELEN]; + + /* + * All path steps observed during this scan. Each of the path + * steps for a particular pathwalk are recorded in sequential + * order in the xfarray. A pathwalk ends either with a step + * pointing to the root directory (success) or pointing to NULLFSINO + * (loop detected, empty dir detected, etc). + */ + struct xfarray *path_steps; + + /* All names observed during this scan. */ + struct xfblob *path_names; + + /* All paths being tracked by this scanner. */ + struct list_head path_list; + + /* Number of paths in path_list. */ + unsigned int nr_paths; + + /* + * Number of parents found by a pptr scan; reset to zero before each + * xattr walk in xchk_dirpath_step_up(). + */ + unsigned int parents_found; + + /* Have the path data been invalidated by a concurrent update? */ + bool stale:1; + + /* Has the scan been aborted?
*/ + bool aborted:1; +}; + +#define xchk_dirtree_for_each_path_safe(dl, path, n) \ + list_for_each_entry_safe((path), (n), &(dl)->path_list, list) + +#define xchk_dirtree_for_each_path(dl, path) \ + list_for_each_entry((path), &(dl)->path_list, list) + +static inline bool +xchk_dirtree_parentless(const struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + + if (sc->ip == sc->mp->m_rootip) + return true; + if (VFS_I(sc->ip)->i_nlink == 0) + return true; + return false; +} + +int xchk_dirtree_find_paths_to_root(struct xchk_dirtree *dl); +int xchk_dirpath_append(struct xchk_dirtree *dl, struct xfs_inode *ip, + struct xchk_dirpath *path, const struct xfs_name *name, + const struct xfs_parent_rec *pptr); +void xchk_dirtree_evaluate(struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc); + +#endif /* __XFS_SCRUB_DIRTREE_H__ */ diff --git a/fs/xfs/scrub/dirtree_repair.c b/fs/xfs/scrub/dirtree_repair.c new file mode 100644 index 000000000000..5c04e70ba951 --- /dev/null +++ b/fs/xfs/scrub/dirtree_repair.c @@ -0,0 +1,821 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_trans_space.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr.h"
+#include "xfs_parent.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/bitmap.h"
+#include "scrub/ino_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/listxattr.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/orphanage.h"
+#include "scrub/dirtree.h"
+#include "scrub/readdir.h"
+
+/*
+ * Directory Tree Structure Repairs
+ * ================================
+ *
+ * If we decide that the directory being scanned is participating in a
+ * directory loop, the only change we can make is to remove directory entries
+ * pointing down to @sc->ip.  If that leaves it with no parents, the directory
+ * should be adopted by the orphanage.
+ */
+
+/*
+ * Set up to repair directory loops.  The only setup done here is the call to
+ * xrep_orphanage_try_create(); its return value is handed straight back to
+ * the caller.
+ */
+int
+xrep_setup_dirtree(
+	struct xfs_scrub	*sc)
+{
+	return xrep_orphanage_try_create(sc);
+}
+
+/*
+ * Change the outcome of this path.  The path number, the number of steps, and
+ * the new outcome are reported through the xrep_dirpath_set_outcome
+ * tracepoint before @path->outcome is overwritten.
+ */
+static inline void
+xrep_dirpath_set_outcome(
+	struct xchk_dirtree		*dl,
+	struct xchk_dirpath		*path,
+	enum xchk_dirpath_outcome	outcome)
+{
+	trace_xrep_dirpath_set_outcome(dl->sc, path->path_nr, path->nr_steps,
+			outcome);
+
+	path->outcome = outcome;
+}
+
+/* Delete all paths.
*/ +STATIC void +xrep_dirtree_delete_all_paths( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + + xchk_dirtree_for_each_path(dl, path) { + switch (path->outcome) { + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + oc->suspect--; + oc->bad++; + xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + break; + case XCHK_DIRPATH_OK: + oc->good--; + oc->bad++; + xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + break; + default: + break; + } + } + + ASSERT(oc->suspect == 0); + ASSERT(oc->good == 0); +} + +/* Since this is the surviving path, set the dotdot entry to this value. */ +STATIC void +xrep_dirpath_retain_parent( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xchk_dirpath_step step; + int error; + + error = xfarray_load(dl->path_steps, path->first_step, &step); + if (error) + return; + + dl->parent_ino = be64_to_cpu(step.pptr_rec.p_ino); +} + +/* Find the one surviving path so we know how to set dotdot. */ +STATIC void +xrep_dirtree_find_surviving_path( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + bool foundit = false; + + xchk_dirtree_for_each_path(dl, path) { + switch (path->outcome) { + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + case XCHK_DIRPATH_OK: + if (!foundit) { + xrep_dirpath_retain_parent(dl, path); + foundit = true; + continue; + } + ASSERT(foundit == false); + break; + default: + break; + } + } + + ASSERT(oc->suspect + oc->good == 1); +} + +/* Delete all paths except for the one good one. 
*/
+/*
+ * The first XCHK_DIRPATH_OK path encountered is retained and supplies
+ * @dl->parent_ino.  Every other OK path, and every CORRUPT or LOOP path, is
+ * reclassified as XCHK_DIRPATH_DELETE with the counters in @oc adjusted to
+ * match.
+ */
+STATIC void
+xrep_dirtree_keep_one_good_path(
+	struct xchk_dirtree		*dl,
+	struct xchk_dirtree_outcomes	*oc)
+{
+	struct xchk_dirpath		*path;
+	bool				foundit = false;
+
+	xchk_dirtree_for_each_path(dl, path) {
+		switch (path->outcome) {
+		case XCHK_DIRPATH_CORRUPT:
+		case XCHK_DIRPATH_LOOP:
+			oc->suspect--;
+			oc->bad++;
+			xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+			break;
+		case XCHK_DIRPATH_OK:
+			if (!foundit) {
+				/* First good path: this one survives. */
+				xrep_dirpath_retain_parent(dl, path);
+				foundit = true;
+				continue;
+			}
+			oc->good--;
+			oc->bad++;
+			xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+			break;
+		default:
+			break;
+		}
+	}
+
+	ASSERT(oc->suspect == 0);
+	ASSERT(oc->good < 2);
+}
+
+/*
+ * Delete all paths except for one suspect one.  The first CORRUPT or LOOP
+ * path encountered is retained and supplies @dl->parent_ino; the remaining
+ * suspect paths are reclassified as XCHK_DIRPATH_DELETE.  The caller must not
+ * use this when any XCHK_DIRPATH_OK path exists.
+ */
+STATIC void
+xrep_dirtree_keep_one_suspect_path(
+	struct xchk_dirtree		*dl,
+	struct xchk_dirtree_outcomes	*oc)
+{
+	struct xchk_dirpath		*path;
+	bool				foundit = false;
+
+	xchk_dirtree_for_each_path(dl, path) {
+		switch (path->outcome) {
+		case XCHK_DIRPATH_CORRUPT:
+		case XCHK_DIRPATH_LOOP:
+			if (!foundit) {
+				/* First suspect path: this one survives. */
+				xrep_dirpath_retain_parent(dl, path);
+				foundit = true;
+				continue;
+			}
+			oc->suspect--;
+			oc->bad++;
+			xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+			break;
+		case XCHK_DIRPATH_OK:
+			ASSERT(0);
+			break;
+		default:
+			break;
+		}
+	}
+
+	ASSERT(oc->suspect == 1);
+	ASSERT(oc->good == 0);
+}
+
+/*
+ * Figure out what to do with the paths we tried to find.  This function
+ * returns nothing; the decision is recorded in the per-path outcomes, in
+ * @oc (including @oc->needs_adoption), and in @dl->parent_ino.
+ */
+STATIC void
+xrep_dirtree_decide_fate(
+	struct xchk_dirtree		*dl,
+	struct xchk_dirtree_outcomes	*oc)
+{
+	xchk_dirtree_evaluate(dl, oc);
+
+	/* Parentless directories should not have any paths at all. */
+	if (xchk_dirtree_parentless(dl)) {
+		xrep_dirtree_delete_all_paths(dl, oc);
+		return;
+	}
+
+	/* One path is exactly the number of paths we want. */
+	if (oc->good + oc->suspect == 1) {
+		xrep_dirtree_find_surviving_path(dl, oc);
+		return;
+	}
+
+	/* Zero paths means we should reattach the subdir to the orphanage. */
+	if (oc->good + oc->suspect == 0) {
+		/* Adoption is only requested when an orphanage is available. */
+		if (dl->sc->orphanage)
+			oc->needs_adoption = true;
+		return;
+	}
+
+	/*
+	 * Otherwise, this subdirectory has too many parents.  If there's at
+	 * least one good path, keep it and delete the others.
+	 */
+	if (oc->good > 0) {
+		xrep_dirtree_keep_one_good_path(dl, oc);
+		return;
+	}
+
+	/*
+	 * There are no good paths and there are too many suspect paths.
+	 * Keep the first suspect path and delete the rest.
+	 */
+	xrep_dirtree_keep_one_suspect_path(dl, oc);
+}
+
+/*
+ * Load the first step of this path into @step and @dl->xname/pptr
+ * for later repair work.  Returns 0, or the error reported by xfarray_load()
+ * or xfblob_loadname().
+ */
+STATIC int
+xrep_dirtree_prep_path(
+	struct xchk_dirtree		*dl,
+	struct xchk_dirpath		*path,
+	struct xchk_dirpath_step	*step)
+{
+	int				error;
+
+	error = xfarray_load(dl->path_steps, path->first_step, step);
+	if (error)
+		return error;
+
+	error = xfblob_loadname(dl->path_names, step->name_cookie, &dl->xname,
+			step->name_len);
+	if (error)
+		return error;
+
+	dl->pptr_rec = step->pptr_rec; /* struct copy */
+	return 0;
+}
+
+/*
+ * Delete the VFS dentry for a removed child.  Returns 0 if the dentry was
+ * deleted or if there was no parent or child dentry to act on, and
+ * -EFSCORRUPTED if either dentry turns out not to be a directory.
+ */
+STATIC int
+xrep_dirtree_purge_dentry(
+	struct xchk_dirtree	*dl,
+	struct xfs_inode	*dp,
+	const struct xfs_name	*name)
+{
+	struct qstr		qname = QSTR_INIT(name->name, name->len);
+	struct dentry		*parent_dentry, *child_dentry;
+	int			error = 0;
+
+	/*
+	 * Find the dentry for the parent directory.  If there isn't one, we're
+	 * done.  Caller already holds i_rwsem for parent and child.
+	 */
+	parent_dentry = d_find_alias(VFS_I(dp));
+	if (!parent_dentry)
+		return 0;
+
+	/* The VFS thinks the parent is a directory, right? */
+	if (!d_is_dir(parent_dentry)) {
+		ASSERT(d_is_dir(parent_dentry));
+		error = -EFSCORRUPTED;
+		goto out_dput_parent;
+	}
+
+	/*
+	 * Try to find the dirent pointing to the child.  If there isn't one,
+	 * we're done.
+	 */
+	qname.hash = full_name_hash(parent_dentry, name->name, name->len);
+	child_dentry = d_lookup(parent_dentry, &qname);
+	if (!child_dentry) {
+		error = 0;
+		goto out_dput_parent;
+	}
+
+	trace_xrep_dirtree_delete_child(dp->i_mount, child_dentry);
+
+	/* Child is not a directory?  We're screwed. */
+	if (!d_is_dir(child_dentry)) {
+		ASSERT(d_is_dir(child_dentry));
+		error = -EFSCORRUPTED;
+		goto out_dput_child;
+	}
+
+	/* Replace the child dentry with a negative one. */
+	d_delete(child_dentry);
+
+out_dput_child:
+	dput(child_dentry);
+out_dput_parent:
+	dput(parent_dentry);
+	return error;
+}
+
+/*
+ * Prepare to delete a link by taking the IOLOCK of the parent and the child
+ * (scrub target).  Caller must hold IOLOCK_EXCL on @sc->ip.  Returns 0 if we
+ * took both locks, or a negative errno if we couldn't lock the parent in time.
+ * On a negative return, IOLOCK_EXCL on @sc->ip has been retaken and @dp is
+ * not locked.
+ */
+static inline int
+xrep_dirtree_unlink_iolock(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*dp)
+{
+	int			error;
+
+	ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL);
+
+	/* Fast path: the parent's IOLOCK was free. */
+	if (xfs_ilock_nowait(dp, XFS_IOLOCK_EXCL))
+		return 0;
+
+	/*
+	 * Slow path: give up our own IOLOCK, block on the parent's, and then
+	 * only trylock ours.  If the trylock fails, release the parent and
+	 * start over after a short delay.
+	 */
+	xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+	do {
+		xfs_ilock(dp, XFS_IOLOCK_EXCL);
+		if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL))
+			break;
+		xfs_iunlock(dp, XFS_IOLOCK_EXCL);
+
+		if (xchk_should_terminate(sc, &error)) {
+			xchk_ilock(sc, XFS_IOLOCK_EXCL);
+			return error;
+		}
+
+		delay(1);
+	} while (1);
+
+	return 0;
+}
+
+/*
+ * Remove a link from the directory tree and update the dcache.  Returns
+ * -ESTALE if the scan data are now out of date.  Whatever the result, the
+ * locks taken on @dp here are released again before returning.
+ */
+STATIC int
+xrep_dirtree_unlink(
+	struct xchk_dirtree		*dl,
+	struct xfs_inode		*dp,
+	struct xchk_dirpath		*path,
+	struct xchk_dirpath_step	*step)
+{
+	struct xfs_scrub		*sc = dl->sc;
+	struct xfs_mount		*mp = sc->mp;
+	xfs_ino_t			dotdot_ino;
+	xfs_ino_t			parent_ino = dl->parent_ino;
+	unsigned int			resblks;
+	int				dontcare;
+	int				error;
+
+	/* Take IOLOCK_EXCL of the parent and child. */
+	error = xrep_dirtree_unlink_iolock(sc, dp);
+	if (error)
+		return error;
+
+	/*
+	 * Create the transaction that we need to sever the path.  Ignore
+	 * EDQUOT and ENOSPC being returned via nospace_error because the
+	 * directory code can handle a reservationless update.
+	 */
+	resblks = xfs_remove_space_res(mp, step->name_len);
+	error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, sc->ip,
+			&resblks, &sc->tp, &dontcare);
+	if (error)
+		goto out_iolock;
+
+	/*
+	 * Cancel if someone invalidated the paths while we were trying to get
+	 * the ILOCK.
+	 */
+	mutex_lock(&dl->lock);
+	if (dl->stale) {
+		mutex_unlock(&dl->lock);
+		error = -ESTALE;
+		goto out_trans_cancel;
+	}
+	xrep_dirpath_set_outcome(dl, path, XREP_DIRPATH_DELETING);
+	mutex_unlock(&dl->lock);
+
+	trace_xrep_dirtree_delete_path(dl->sc, sc->ip, path->path_nr,
+			&dl->xname, &dl->pptr_rec);
+
+	/*
+	 * Decide if we need to reset the dotdot entry.  Rules:
+	 *
+	 * - If there's a surviving parent, we want dotdot to point there.
+	 * - If we don't have any surviving parents, then point dotdot at the
+	 *   root dir.
+	 * - If dotdot is already set to the value we want, pass in NULLFSINO
+	 *   for no change necessary.
+	 *
+	 * Do this /before/ we dirty anything, in case the dotdot lookup
+	 * fails.
+	 */
+	error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &dotdot_ino);
+	if (error)
+		goto out_trans_cancel;
+	if (parent_ino == NULLFSINO)
+		parent_ino = dl->root_ino;
+	if (dotdot_ino == parent_ino)
+		parent_ino = NULLFSINO;
+
+	/* Drop the link from sc->ip's dotdot entry. */
+	error = xfs_droplink(sc->tp, dp);
+	if (error)
+		goto out_trans_cancel;
+
+	/* Reset the dotdot entry to a surviving parent. */
+	if (parent_ino != NULLFSINO) {
+		error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot,
+				parent_ino, 0);
+		if (error)
+			goto out_trans_cancel;
+	}
+
+	/* Drop the link from dp to sc->ip. */
+	error = xfs_droplink(sc->tp, sc->ip);
+	if (error)
+		goto out_trans_cancel;
+
+	error = xfs_dir_removename(sc->tp, dp, &dl->xname, sc->ip->i_ino,
+			resblks);
+	if (error) {
+		ASSERT(error != -ENOENT);
+		goto out_trans_cancel;
+	}
+
+	/* Also remove the matching parent pointer if the fs has them. */
+	if (xfs_has_parent(sc->mp)) {
+		error = xfs_parent_removename(sc->tp, &dl->ppargs, dp,
+				&dl->xname, sc->ip);
+		if (error)
+			goto out_trans_cancel;
+	}
+
+	/*
+	 * Notify dirent hooks that we removed the bad link, invalidate the
+	 * dcache, and commit the repair.
+	 */
+	xfs_dir_update_hook(dp, sc->ip, -1, &dl->xname);
+	error = xrep_dirtree_purge_dentry(dl, dp, &dl->xname);
+	if (error)
+		goto out_trans_cancel;
+
+	error = xrep_trans_commit(sc);
+	goto out_ilock;
+
+out_trans_cancel:
+	xchk_trans_cancel(sc);
+out_ilock:
+	xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+out_iolock:
+	xfs_iunlock(dp, XFS_IOLOCK_EXCL);
+	return error;
+}
+
+/*
+ * Delete a directory entry that points to this directory.  Returns -ESTALE
+ * if the scan data are now out of date.
+ */
+STATIC int
+xrep_dirtree_delete_path(
+	struct xchk_dirtree		*dl,
+	struct xchk_dirpath		*path)
+{
+	struct xchk_dirpath_step	step;
+	struct xfs_scrub		*sc = dl->sc;
+	struct xfs_inode		*dp;
+	int				error;
+
+	/*
+	 * Load the parent pointer and directory inode for this path, then
+	 * drop the scan lock, the ILOCK, and the transaction so that
+	 * xrep_dirtree_unlink can allocate the transaction it needs.  This
+	 * sets up @dl->xname for the deletion.
+	 */
+	error = xrep_dirtree_prep_path(dl, path, &step);
+	if (error)
+		return error;
+
+	error = xchk_iget(sc, be64_to_cpu(step.pptr_rec.p_ino), &dp);
+	if (error)
+		return error;
+
+	mutex_unlock(&dl->lock);
+	xchk_trans_cancel(sc);
+	xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+	/* Delete the directory link and release the parent. */
+	error = xrep_dirtree_unlink(dl, dp, path, &step);
+	xchk_irele(sc, dp);
+
+	/*
+	 * Retake all the resources we had at the beginning even if the repair
+	 * failed or the scan data are now stale.  This keeps things simple for
+	 * the caller.
+	 *
+	 * NOTE(review): any result from xchk_trans_alloc_empty() is discarded
+	 * here -- confirm that it cannot fail in this context.
+	 */
+	xchk_trans_alloc_empty(sc);
+	xchk_ilock(sc, XFS_ILOCK_EXCL);
+	mutex_lock(&dl->lock);
+
+	if (!error && dl->stale)
+		error = -ESTALE;
+	return error;
+}
+
+/*
+ * Add a new path to represent our in-progress adoption.  The new path has a
+ * single step naming the orphanage as parent and starts out with outcome
+ * XREP_DIRPATH_ADOPTING.  Returns 0 or a negative errno.
+ */
+STATIC int
+xrep_dirtree_create_adoption_path(
+	struct xchk_dirtree		*dl)
+{
+	struct xfs_scrub		*sc = dl->sc;
+	struct xchk_dirpath		*path;
+	int				error;
+
+	/*
+	 * We should have capped the number of paths at XFS_MAXLINK-1 in the
+	 * scanner.
+	 *
+	 * NOTE(review): the test below still accepts nr_paths == XFS_MAXLINK,
+	 * which is looser than the cap described above -- confirm the
+	 * intended bound.
+	 */
+	if (dl->nr_paths > XFS_MAXLINK) {
+		ASSERT(dl->nr_paths <= XFS_MAXLINK);
+		return -EFSCORRUPTED;
+	}
+
+	/*
+	 * Create a new xchk_dirpath structure to remember this parent pointer
+	 * and record the first name step.
+	 */
+	path = kmalloc(sizeof(struct xchk_dirpath), XCHK_GFP_FLAGS);
+	if (!path)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&path->list);
+	xino_bitmap_init(&path->seen_inodes);
+	path->nr_steps = 0;
+	path->outcome = XREP_DIRPATH_ADOPTING;
+
+	/*
+	 * Record the new link that we just created in the orphanage.  Because
+	 * adoption is the last repair that we perform, we don't bother filling
+	 * in the path all the way back to the root.
+	 */
+	xfs_inode_to_parent_rec(&dl->pptr_rec, sc->orphanage);
+
+	error = xino_bitmap_set(&path->seen_inodes, sc->orphanage->i_ino);
+	if (error)
+		goto out_path;
+
+	trace_xrep_dirtree_create_adoption(sc, sc->ip, dl->nr_paths,
+			&dl->xname, &dl->pptr_rec);
+
+	error = xchk_dirpath_append(dl, sc->ip, path, &dl->xname,
+			&dl->pptr_rec);
+	if (error)
+		goto out_path;
+
+	/* The step we just appended is the last element of path_steps. */
+	path->first_step = xfarray_length(dl->path_steps) - 1;
+	path->second_step = XFARRAY_NULLIDX;
+	path->path_nr = dl->nr_paths;
+
+	list_add_tail(&path->list, &dl->path_list);
+	dl->nr_paths++;
+	return 0;
+
+out_path:
+	/*
+	 * NOTE(review): only the path structure itself is freed here.  If
+	 * xino_bitmap_set() attached allocations to @path->seen_inodes before
+	 * a later step failed, they may need explicit teardown -- confirm.
+	 */
+	kfree(path);
+	return error;
+}
+
+/*
+ * Prepare to move a file to the orphanage by taking the IOLOCK of the
+ * orphanage and the child (scrub target).  Caller must hold IOLOCK_EXCL on
+ * @sc->ip.
Returns 0 if we took both locks, or a negative errno if we + * couldn't lock the orphanage in time. + */ +static inline int +xrep_dirtree_adopt_iolock( + struct xfs_scrub *sc) +{ + int error; + + ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL); + + if (xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + return 0; + + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + do { + xrep_orphanage_ilock(sc, XFS_IOLOCK_EXCL); + if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + break; + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + + if (xchk_should_terminate(sc, &error)) { + xchk_ilock(sc, XFS_IOLOCK_EXCL); + return error; + } + + delay(1); + } while (1); + + return 0; +} + +/* + * Reattach this orphaned directory to the orphanage. Do not call this with + * any resources held. Returns -ESTALE if the scan data have become out of + * date. + */ +STATIC int +xrep_dirtree_adopt( + struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + int error; + + /* Take the IOLOCK of the orphanage and the scrub target. */ + error = xrep_dirtree_adopt_iolock(sc); + if (error) + return error; + + /* + * Set up for an adoption. The directory tree fixer runs after the + * link counts have been corrected. Therefore, we must bump the + * child's link count since there will be no further opportunity to fix + * errors. + */ + error = xrep_adoption_trans_alloc(sc, &dl->adoption); + if (error) + goto out_iolock; + dl->adoption.bump_child_nlink = true; + + /* Figure out what name we're going to use here. */ + error = xrep_adoption_compute_name(&dl->adoption, &dl->xname); + if (error) + goto out_trans; + + /* + * Now that we have a proposed name for the orphanage entry, create + * a faux path so that the live update hook will see it. + */ + mutex_lock(&dl->lock); + if (dl->stale) { + mutex_unlock(&dl->lock); + error = -ESTALE; + goto out_trans; + } + error = xrep_dirtree_create_adoption_path(dl); + mutex_unlock(&dl->lock); + if (error) + goto out_trans; + + /* Reparent the directory. 
*/ + error = xrep_adoption_move(&dl->adoption); + if (error) + goto out_trans; + + /* + * Commit the name and release all inode locks except for the scrub + * target's IOLOCK. + */ + error = xrep_trans_commit(sc); + goto out_ilock; + +out_trans: + xchk_trans_cancel(sc); +out_ilock: + xchk_iunlock(sc, XFS_ILOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); +out_iolock: + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + return error; +} + +/* + * This newly orphaned directory needs to be adopted by the orphanage. + * Make this happen. + */ +STATIC int +xrep_dirtree_move_to_orphanage( + struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + int error; + + /* + * Start by dropping all the resources that we hold so that we can grab + * all the resources that we need for the adoption. + */ + mutex_unlock(&dl->lock); + xchk_trans_cancel(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* Perform the adoption. */ + error = xrep_dirtree_adopt(dl); + + /* + * Retake all the resources we had at the beginning even if the repair + * failed or the scan data are now stale. This keeps things simple for + * the caller. + */ + xchk_trans_alloc_empty(sc); + xchk_ilock(sc, XFS_ILOCK_EXCL); + mutex_lock(&dl->lock); + + if (!error && dl->stale) + error = -ESTALE; + return error; +} + +/* + * Try to fix all the problems. Returns -ESTALE if the scan data have become + * out of date. + */ +STATIC int +xrep_dirtree_fix_problems( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + int error; + + /* Delete all the paths we don't want. */ + xchk_dirtree_for_each_path(dl, path) { + if (path->outcome != XCHK_DIRPATH_DELETE) + continue; + + error = xrep_dirtree_delete_path(dl, path); + if (error) + return error; + } + + /* Reparent this directory to the orphanage. 
*/ + if (oc->needs_adoption) { + if (xrep_orphanage_can_adopt(dl->sc)) + return xrep_dirtree_move_to_orphanage(dl); + return -EFSCORRUPTED; + } + + return 0; +} + +/* Fix directory loops involving this directory. */ +int +xrep_dirtree( + struct xfs_scrub *sc) +{ + struct xchk_dirtree *dl = sc->buf; + struct xchk_dirtree_outcomes oc; + int error; + + /* + * Prepare to fix the directory tree by retaking the scan lock. The + * order of resource acquisition is still IOLOCK -> transaction -> + * ILOCK -> scan lock. + */ + mutex_lock(&dl->lock); + do { + /* + * Decide what we're going to do, then do it. An -ESTALE + * return here means the scan results are invalid and we have + * to walk again. + */ + if (!dl->stale) { + xrep_dirtree_decide_fate(dl, &oc); + + trace_xrep_dirtree_decided_fate(dl, &oc); + + error = xrep_dirtree_fix_problems(dl, &oc); + if (!error || error != -ESTALE) + break; + } + error = xchk_dirtree_find_paths_to_root(dl); + if (error == -ELNRNG || error == -ENOSR) + error = -EFSCORRUPTED; + } while (!error); + mutex_unlock(&dl->lock); + + return error; +} diff --git a/fs/xfs/scrub/findparent.c b/fs/xfs/scrub/findparent.c new file mode 100644 index 000000000000..01766041ba2c --- /dev/null +++ b/fs/xfs/scrub/findparent.c @@ -0,0 +1,454 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_priv.h" +#include "xfs_trans_space.h" +#include "xfs_health.h" +#include "xfs_exchmaps.h" +#include "xfs_parent.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/iscan.h" +#include "scrub/findparent.h" +#include "scrub/readdir.h" +#include "scrub/tempfile.h" +#include "scrub/listxattr.h" + +/* + * Finding the Parent of a Directory + * ================================= + * + * Directories have parent pointers, in the sense that each directory contains + * a dotdot entry that points to the single allowed parent. The brute force + * way to find the parent of a given directory is to scan every directory in + * the filesystem looking for a child dirent that references this directory. + * + * This module wraps the process of scanning the directory tree. It requires + * that @sc->ip is the directory whose parent we want to find, and that the + * caller hold only the IOLOCK on that directory. The scan itself needs to + * take the ILOCK of each directory visited. + * + * Because we cannot hold @sc->ip's ILOCK during a scan of the whole fs, it is + * necessary to use dirent hook to update the parent scan results. Callers + * must not read the scan results without re-taking @sc->ip's ILOCK. + * + * There are a few shortcuts that we can take to avoid scanning the entire + * filesystem, such as noticing directory tree roots and querying the dentry + * cache for parent information. 
+ */
+
+struct xrep_findparent_info {
+	/* The directory currently being scanned. */
+	struct xfs_inode	*dp;
+
+	/*
+	 * Scrub context.  We're looking for a @dp containing a directory
+	 * entry pointing to sc->ip->i_ino.
+	 */
+	struct xfs_scrub	*sc;
+
+	/* Optional scan information for a xrep_findparent_scan call. */
+	struct xrep_parent_scan_info *parent_scan;
+
+	/*
+	 * Parent that we've found for sc->ip.  If we're scanning the entire
+	 * directory tree, we need this to ensure that we only find /one/
+	 * parent directory.
+	 */
+	xfs_ino_t		found_parent;
+
+	/*
+	 * This is set to true if @found_parent was not observed directly from
+	 * the directory scan but by noticing a change in dotdot entries after
+	 * cycling the sc->ip IOLOCK.
+	 */
+	bool			parent_tentative;
+};
+
+/*
+ * If this directory entry points to the scrub target inode, then the directory
+ * we're scanning is the parent of the scrub target inode.
+ *
+ * This is the per-dirent callback handed to xchk_dir_walk(); @priv is the
+ * struct xrep_findparent_info for the walk.  Returns 0 to keep walking, or a
+ * negative errno (-EFSCORRUPTED for a garbage name or a second parent) to
+ * stop.
+ */
+STATIC int
+xrep_findparent_dirent(
+	struct xfs_scrub		*sc,
+	struct xfs_inode		*dp,
+	xfs_dir2_dataptr_t		dapos,
+	const struct xfs_name		*name,
+	xfs_ino_t			ino,
+	void				*priv)
+{
+	struct xrep_findparent_info	*fpi = priv;
+	int				error = 0;
+
+	if (xchk_should_terminate(fpi->sc, &error))
+		return error;
+
+	/* Entries pointing at other inodes are of no interest. */
+	if (ino != fpi->sc->ip->i_ino)
+		return 0;
+
+	/* A garbage name on an entry pointing at us is corruption. */
+	if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
+		return -EFSCORRUPTED;
+
+	/*
+	 * Ignore dotdot and dot entries -- we're looking for parent -> child
+	 * links only.
+	 */
+	if (name->name[0] == '.' && (name->len == 1 ||
+	     (name->len == 2 && name->name[1] == '.')))
+		return 0;
+
+	/*
+	 * Uhoh, more than one parent for a dir?  A tentative parent that
+	 * matches the directory being walked is not a conflict.
+	 */
+	if (fpi->found_parent != NULLFSINO &&
+	    !(fpi->parent_tentative && fpi->found_parent == fpi->dp->i_ino)) {
+		trace_xrep_findparent_dirent(fpi->sc->ip, 0);
+		return -EFSCORRUPTED;
+	}
+
+	/* We found a potential parent; remember this. */
+	trace_xrep_findparent_dirent(fpi->sc->ip, fpi->dp->i_ino);
+	fpi->found_parent = fpi->dp->i_ino;
+	fpi->parent_tentative = false;
+
+	if (fpi->parent_scan)
+		xrep_findparent_scan_found(fpi->parent_scan, fpi->dp->i_ino);
+
+	return 0;
+}
+
+/*
+ * If this is a directory, walk the dirents looking for any that point to the
+ * scrub target inode.  Returns 0 or a negative errno: -EFSCORRUPTED if @dp is
+ * already marked sick, -EBUSY if it looks zapped, or whatever the walk
+ * returns.
+ */
+STATIC int
+xrep_findparent_walk_directory(
+	struct xrep_findparent_info	*fpi)
+{
+	struct xfs_scrub		*sc = fpi->sc;
+	struct xfs_inode		*dp = fpi->dp;
+	unsigned int			lock_mode;
+	int				error = 0;
+
+	/*
+	 * The inode being scanned cannot be its own parent, nor can any
+	 * temporary directory we created to stage this repair.
+	 */
+	if (dp == sc->ip || dp == sc->tempip)
+		return 0;
+
+	/*
+	 * Similarly, temporary files created to stage a repair cannot be the
+	 * parent of this inode.
+	 */
+	if (xrep_is_tempfile(dp))
+		return 0;
+
+	/*
+	 * Scan the directory to see if it contains an entry pointing to
+	 * the directory that we are repairing.
+	 */
+	lock_mode = xfs_ilock_data_map_shared(dp);
+
+	/*
+	 * If this directory is known to be sick, we cannot scan it reliably
+	 * and must abort.
+	 */
+	if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE |
+				       XFS_SICK_INO_BMBTD |
+				       XFS_SICK_INO_DIR)) {
+		error = -EFSCORRUPTED;
+		goto out_unlock;
+	}
+
+	/*
+	 * We cannot complete our parent pointer scan if a directory looks as
+	 * though it has been zapped by the inode record repair code.
+	 */
+	if (xchk_dir_looks_zapped(dp)) {
+		error = -EBUSY;
+		goto out_unlock;
+	}
+
+	error = xchk_dir_walk(sc, dp, xrep_findparent_dirent, fpi);
+	if (error)
+		goto out_unlock;
+
+out_unlock:
+	xfs_iunlock(dp, lock_mode);
+	return error;
+}
+
+/*
+ * Update this directory's dotdot pointer based on ongoing dirent updates.
+ */
+STATIC int
+xrep_findparent_live_update(
+	struct notifier_block		*nb,
+	unsigned long			action,
+	void				*data)
+{
+	struct xfs_dir_update_params	*p = data;
+	struct xrep_parent_scan_info	*pscan;
+	struct xfs_scrub		*sc;
+
+	/* Recover the scan context that embeds this notifier block. */
+	pscan = container_of(nb, struct xrep_parent_scan_info,
+			dhook.dirent_hook.nb);
+	sc = pscan->sc;
+
+	/*
+	 * If @p->ip is the subdirectory that we're interested in and we've
+	 * already scanned @p->dp, update the dotdot target inumber to the
+	 * parent inode.
+	 */
+	if (p->ip->i_ino == sc->ip->i_ino &&
+	    xchk_iscan_want_live_update(&pscan->iscan, p->dp->i_ino)) {
+		/* Positive delta records @p->dp; anything else clears it. */
+		if (p->delta > 0) {
+			xrep_findparent_scan_found(pscan, p->dp->i_ino);
+		} else {
+			xrep_findparent_scan_found(pscan, NULLFSINO);
+		}
+	}
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * Set up a scan to find the parent of a directory.  The provided dirent hook
+ * will be called when there is a dotdot update for the inode being repaired.
+ * If @custom_fn is NULL, xrep_findparent_live_update is installed as the
+ * hook function.
+ *
+ * Returns 0 on success, -EINVAL if XCHK_FSGATES_DIRENTS is not set in
+ * @sc->flags, or the error from xfs_dir_hook_add(), in which case the inode
+ * scan and the lock are torn down again.
+ */
+int
+__xrep_findparent_scan_start(
+	struct xfs_scrub		*sc,
+	struct xrep_parent_scan_info	*pscan,
+	notifier_fn_t			custom_fn)
+{
+	int				error;
+
+	if (!(sc->flags & XCHK_FSGATES_DIRENTS)) {
+		ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+		return -EINVAL;
+	}
+
+	pscan->sc = sc;
+	pscan->parent_ino = NULLFSINO;
+
+	mutex_init(&pscan->lock);
+
+	/*
+	 * NOTE(review): 30000 and 100 are passed positionally; their meaning
+	 * is defined by xchk_iscan_start() -- confirm against its prototype.
+	 */
+	xchk_iscan_start(sc, 30000, 100, &pscan->iscan);
+
+	/*
+	 * Hook into the dirent update code.  The hook only operates on inodes
+	 * that were already scanned, and the scanner thread takes each inode's
+	 * ILOCK, which means that any in-progress inode updates will finish
+	 * before we can scan the inode.
+	 */
+	if (custom_fn)
+		xfs_dir_hook_setup(&pscan->dhook, custom_fn);
+	else
+		xfs_dir_hook_setup(&pscan->dhook, xrep_findparent_live_update);
+	error = xfs_dir_hook_add(sc->mp, &pscan->dhook);
+	if (error)
+		goto out_iscan;
+
+	return 0;
+out_iscan:
+	xchk_iscan_teardown(&pscan->iscan);
+	mutex_destroy(&pscan->lock);
+	return error;
+}
+
+/*
+ * Scan the entire filesystem looking for a parent inode for the inode being
+ * scrubbed.  @sc->ip must not be the root of a directory tree.  Callers must
+ * not hold a dirty transaction or any lock that would interfere with taking
+ * an ILOCK.
+ *
+ * Returns 0 with @pscan->parent_ino set to the parent that we found.
+ * Returns 0 with @pscan->parent_ino set to NULLFSINO if we found no parents.
+ * Returns the usual negative errno if something else happened.
+ */
+int
+xrep_findparent_scan(
+	struct xrep_parent_scan_info	*pscan)
+{
+	struct xrep_findparent_info	fpi = {
+		.sc			= pscan->sc,
+		.found_parent		= NULLFSINO,
+		.parent_scan		= pscan,
+	};
+	struct xfs_scrub		*sc = pscan->sc;
+	int				ret;
+
+	ASSERT(S_ISDIR(VFS_IC(sc->ip)->i_mode));
+
+	/* The iterator returns 1 for each inode it hands us in fpi.dp. */
+	while ((ret = xchk_iscan_iter(&pscan->iscan, &fpi.dp)) == 1) {
+		/* Only directories can hold a dirent pointing at sc->ip. */
+		if (S_ISDIR(VFS_I(fpi.dp)->i_mode))
+			ret = xrep_findparent_walk_directory(&fpi);
+		else
+			ret = 0;
+		/* Mark visited and release even if the walk failed. */
+		xchk_iscan_mark_visited(&pscan->iscan, fpi.dp);
+		xchk_irele(sc, fpi.dp);
+		if (ret)
+			break;
+
+		if (xchk_should_terminate(sc, &ret))
+			break;
+	}
+	xchk_iscan_iter_finish(&pscan->iscan);
+
+	return ret;
+}
+
+/*
+ * Tear down a parent scan: remove the dirent hook, tear down the inode scan,
+ * and destroy the lock, in that order.
+ */
+void
+xrep_findparent_scan_teardown(
+	struct xrep_parent_scan_info	*pscan)
+{
+	xfs_dir_hook_del(pscan->sc->mp, &pscan->dhook);
+	xchk_iscan_teardown(&pscan->iscan);
+	mutex_destroy(&pscan->lock);
+}
+
+/* Finish a parent scan early.
*/
+/* Record @ino as the parent, then call xchk_iscan_finish_early() on the scan. */
+void
+xrep_findparent_scan_finish_early(
+	struct xrep_parent_scan_info	*pscan,
+	xfs_ino_t			ino)
+{
+	xrep_findparent_scan_found(pscan, ino);
+	xchk_iscan_finish_early(&pscan->iscan);
+}
+
+/*
+ * Confirm that the directory @parent_ino actually contains a directory entry
+ * pointing to the child @sc->ip->i_ino.  This function returns one of several
+ * ways:
+ *
+ * Returns 0 with @parent_ino set to the root directory if @sc->ip is the root
+ * directory or has a link count of zero.
+ * Returns 0 with @parent_ino unchanged if the parent was confirmed.
+ * Returns 0 with @parent_ino set to NULLFSINO if the parent was not valid.
+ * Returns the usual negative errno if something else happened.
+ */
+int
+xrep_findparent_confirm(
+	struct xfs_scrub	*sc,
+	xfs_ino_t		*parent_ino)
+{
+	struct xrep_findparent_info fpi = {
+		.sc		= sc,
+		.found_parent	= NULLFSINO,
+	};
+	int			error;
+
+	/*
+	 * The root directory always points to itself.  Unlinked dirs can point
+	 * anywhere, so we point them at the root dir too.
+	 */
+	if (sc->ip == sc->mp->m_rootip || VFS_I(sc->ip)->i_nlink == 0) {
+		*parent_ino = sc->mp->m_sb.sb_rootino;
+		return 0;
+	}
+
+	/* Reject garbage parent inode numbers and self-referential parents. */
+	if (*parent_ino == NULLFSINO)
+		return 0;
+	if (!xfs_verify_dir_ino(sc->mp, *parent_ino) ||
+	    *parent_ino == sc->ip->i_ino) {
+		*parent_ino = NULLFSINO;
+		return 0;
+	}
+
+	error = xchk_iget(sc, *parent_ino, &fpi.dp);
+	if (error)
+		return error;
+
+	/* A non-directory cannot be anyone's parent. */
+	if (!S_ISDIR(VFS_I(fpi.dp)->i_mode)) {
+		*parent_ino = NULLFSINO;
+		goto out_rele;
+	}
+
+	error = xrep_findparent_walk_directory(&fpi);
+	if (error)
+		goto out_rele;
+
+	/* NULLFSINO here means the walk found no dirent pointing at us. */
+	*parent_ino = fpi.found_parent;
+out_rele:
+	xchk_irele(sc, fpi.dp);
+	return error;
+}
+
+/*
+ * If we're the root of a directory tree, we are our own parent.  If we're an
+ * unlinked directory, the parent /won't/ have a link to us.  Set the parent
+ * directory to the root for both cases.  Returns NULLFSINO if we don't know
+ * what to do.
+ */ +xfs_ino_t +xrep_findparent_self_reference( + struct xfs_scrub *sc) +{ + if (sc->ip->i_ino == sc->mp->m_sb.sb_rootino) + return sc->mp->m_sb.sb_rootino; + + if (VFS_I(sc->ip)->i_nlink == 0) + return sc->mp->m_sb.sb_rootino; + + return NULLFSINO; +} + +/* Check the dentry cache to see if knows of a parent for the scrub target. */ +xfs_ino_t +xrep_findparent_from_dcache( + struct xfs_scrub *sc) +{ + struct inode *pip = NULL; + struct dentry *dentry, *parent; + xfs_ino_t ret = NULLFSINO; + + dentry = d_find_alias(VFS_I(sc->ip)); + if (!dentry) + goto out; + + parent = dget_parent(dentry); + if (!parent) + goto out_dput; + + ASSERT(parent->d_sb == sc->ip->i_mount->m_super); + + pip = igrab(d_inode(parent)); + dput(parent); + + if (S_ISDIR(pip->i_mode)) { + trace_xrep_findparent_from_dcache(sc->ip, XFS_I(pip)->i_ino); + ret = XFS_I(pip)->i_ino; + } + + xchk_irele(sc, XFS_I(pip)); + +out_dput: + dput(dentry); +out: + return ret; +} diff --git a/fs/xfs/scrub/findparent.h b/fs/xfs/scrub/findparent.h new file mode 100644 index 000000000000..d998c7a88152 --- /dev/null +++ b/fs/xfs/scrub/findparent.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_FINDPARENT_H__ +#define __XFS_SCRUB_FINDPARENT_H__ + +struct xrep_parent_scan_info { + struct xfs_scrub *sc; + + /* Inode scan cursor. */ + struct xchk_iscan iscan; + + /* Hook to capture directory entry updates. */ + struct xfs_dir_hook dhook; + + /* Lock protecting parent_ino. */ + struct mutex lock; + + /* Parent inode that we've found. 
*/ + xfs_ino_t parent_ino; + + bool lookup_parent; +}; + +int __xrep_findparent_scan_start(struct xfs_scrub *sc, + struct xrep_parent_scan_info *pscan, + notifier_fn_t custom_fn); +static inline int xrep_findparent_scan_start(struct xfs_scrub *sc, + struct xrep_parent_scan_info *pscan) +{ + return __xrep_findparent_scan_start(sc, pscan, NULL); +} +int xrep_findparent_scan(struct xrep_parent_scan_info *pscan); +void xrep_findparent_scan_teardown(struct xrep_parent_scan_info *pscan); + +static inline void +xrep_findparent_scan_found( + struct xrep_parent_scan_info *pscan, + xfs_ino_t ino) +{ + mutex_lock(&pscan->lock); + pscan->parent_ino = ino; + mutex_unlock(&pscan->lock); +} + +void xrep_findparent_scan_finish_early(struct xrep_parent_scan_info *pscan, + xfs_ino_t ino); + +int xrep_findparent_confirm(struct xfs_scrub *sc, xfs_ino_t *parent_ino); + +xfs_ino_t xrep_findparent_self_reference(struct xfs_scrub *sc); +xfs_ino_t xrep_findparent_from_dcache(struct xfs_scrub *sc); + +#endif /* __XFS_SCRUB_FINDPARENT_H__ */ diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index d310737c8823..1d3e98346933 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -85,7 +85,7 @@ xchk_fscount_warmup( continue; /* Lock both AG headers. 
*/ - error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); + error = xfs_ialloc_read_agi(pag, sc->tp, 0, &agi_bp); if (error) break; error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp); @@ -412,10 +412,11 @@ xchk_fscount_count_frextents( int error; fsc->frextents = 0; + fsc->frextents_delayed = 0; if (!xfs_has_realtime(mp)) return 0; - xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xfs_rtbitmap_lock_shared(sc->mp, XFS_RBMLOCK_BITMAP); error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_fscount_add_frextent, fsc); if (error) { @@ -423,8 +424,10 @@ xchk_fscount_count_frextents( goto out_unlock; } + fsc->frextents_delayed = percpu_counter_sum(&mp->m_delalloc_rtextents); + out_unlock: - xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xfs_rtbitmap_unlock_shared(sc->mp, XFS_RBMLOCK_BITMAP); return error; } #else @@ -434,6 +437,7 @@ xchk_fscount_count_frextents( struct xchk_fscounters *fsc) { fsc->frextents = 0; + fsc->frextents_delayed = 0; return 0; } #endif /* CONFIG_XFS_RT */ @@ -517,7 +521,7 @@ xchk_fscounters( /* * If the filesystem is not frozen, the counter summation calls above - * can race with xfs_mod_freecounter, which subtracts a requested space + * can race with xfs_dec_freecounter, which subtracts a requested space * reservation from the counter and undoes the subtraction if that made * the counter go negative. 
Therefore, it's possible to see negative * values here, and we should only flag that as a corruption if we @@ -593,7 +597,7 @@ xchk_fscounters( } if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, - fsc->frextents)) { + fsc->frextents - fsc->frextents_delayed)) { if (fsc->frozen) xchk_set_corrupt(sc); else diff --git a/fs/xfs/scrub/fscounters.h b/fs/xfs/scrub/fscounters.h index 461a13d25f4b..bcf56e1c36f9 100644 --- a/fs/xfs/scrub/fscounters.h +++ b/fs/xfs/scrub/fscounters.h @@ -12,6 +12,7 @@ struct xchk_fscounters { uint64_t ifree; uint64_t fdblocks; uint64_t frextents; + uint64_t frextents_delayed; unsigned long long icount_min; unsigned long long icount_max; bool frozen; diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c index 94cdb852bee4..469bf645dbea 100644 --- a/fs/xfs/scrub/fscounters_repair.c +++ b/fs/xfs/scrub/fscounters_repair.c @@ -65,7 +65,17 @@ xrep_fscounters( percpu_counter_set(&mp->m_icount, fsc->icount); percpu_counter_set(&mp->m_ifree, fsc->ifree); percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks); - percpu_counter_set(&mp->m_frextents, fsc->frextents); + + /* + * Online repair is only supported on v5 file systems, which require + * lazy sb counters and thus no update of sb_fdblocks here. But as of + * now we don't support lazy counting sb_frextents yet, and thus need + * to also update it directly here. And for that we need to keep + * track of the delalloc reservations separately, as they are + * subtracted from m_frextents, but not included in sb_frextents.
+ */ + percpu_counter_set(&mp->m_frextents, + fsc->frextents - fsc->frextents_delayed); mp->m_sb.sb_frextents = fsc->frextents; return 0; diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 9020a6bef7f1..b712a8bd34f5 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -108,6 +108,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS }, [XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK }, [XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, XFS_SICK_FS_NLINKS }, + [XFS_SCRUB_TYPE_DIRTREE] = { XHG_INO, XFS_SICK_INO_DIRTREE }, }; /* Return the health status mask for this scrub type. */ diff --git a/fs/xfs/scrub/ino_bitmap.h b/fs/xfs/scrub/ino_bitmap.h new file mode 100644 index 000000000000..1300833679ab --- /dev/null +++ b/fs/xfs/scrub/ino_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_INO_BITMAP_H__ +#define __XFS_SCRUB_INO_BITMAP_H__ + +/* Bitmaps, but type-checked for xfs_ino_t */ + +struct xino_bitmap { + struct xbitmap64 inobitmap; +}; + +static inline void xino_bitmap_init(struct xino_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->inobitmap); +} + +static inline void xino_bitmap_destroy(struct xino_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->inobitmap); +} + +static inline int xino_bitmap_set(struct xino_bitmap *bitmap, xfs_ino_t ino) +{ + return xbitmap64_set(&bitmap->inobitmap, ino, 1); +} + +static inline int xino_bitmap_test(struct xino_bitmap *bitmap, xfs_ino_t ino) +{ + uint64_t len = 1; + + return xbitmap64_test(&bitmap->inobitmap, ino, &len); +} + +#endif /* __XFS_SCRUB_INO_BITMAP_H__ */ diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 6e2fe2d6250b..d32716fb2fec 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -739,6 +739,23 @@ xchk_inode_check_reflink_iflag( xchk_ino_set_corrupt(sc, ino); } +/* + * If this inode has zero link count, it must be on the unlinked list. If + * it has nonzero link count, it must not be on the unlinked list. + */ +STATIC void +xchk_inode_check_unlinked( + struct xfs_scrub *sc) +{ + if (VFS_I(sc->ip)->i_nlink == 0) { + if (!xfs_inode_on_unlinked_list(sc->ip)) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } else { + if (xfs_inode_on_unlinked_list(sc->ip)) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } +} + /* Scrub an inode.
*/ int xchk_inode( @@ -771,6 +788,8 @@ xchk_inode( if (S_ISREG(VFS_I(sc->ip)->i_mode)) xchk_inode_check_reflink_iflag(sc, sc->ip->i_ino); + xchk_inode_check_unlinked(sc); + xchk_inode_xref(sc, sc->ip->i_ino, &di); out: return error; diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index eab380e95ef4..daf9f1ee7c2c 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -46,6 +46,7 @@ #include "scrub/repair.h" #include "scrub/iscan.h" #include "scrub/readdir.h" +#include "scrub/tempfile.h" /* * Inode Record Repair @@ -282,6 +283,51 @@ xrep_dinode_findmode_dirent( return 0; } +/* Try to lock a directory, or wait a jiffy. */ +static inline int +xrep_dinode_ilock_nowait( + struct xfs_inode *dp, + unsigned int lock_mode) +{ + if (xfs_ilock_nowait(dp, lock_mode)) + return true; + + schedule_timeout_killable(1); + return false; +} + +/* + * Try to lock a directory to look for ftype hints. Since we already hold the + * AGI buffer, we cannot block waiting for the ILOCK because rename can take + * the ILOCK and then try to lock AGIs. + */ +STATIC int +xrep_dinode_trylock_directory( + struct xrep_inode *ri, + struct xfs_inode *dp, + unsigned int *lock_modep) +{ + unsigned long deadline = jiffies + msecs_to_jiffies(30000); + unsigned int lock_mode; + int error = 0; + + do { + if (xchk_should_terminate(ri->sc, &error)) + return error; + + if (xfs_need_iread_extents(&dp->i_df)) + lock_mode = XFS_ILOCK_EXCL; + else + lock_mode = XFS_ILOCK_SHARED; + + if (xrep_dinode_ilock_nowait(dp, lock_mode)) { + *lock_modep = lock_mode; + return 0; + } + } while (!time_is_before_jiffies(deadline)); + return -EBUSY; +} + /* * If this is a directory, walk the dirents looking for any that point to the * scrub target inode. @@ -295,11 +341,17 @@ xrep_dinode_findmode_walk_directory( unsigned int lock_mode; int error = 0; + /* Ignore temporary repair directories. 
*/ + if (xrep_is_tempfile(dp)) + return 0; + /* * Scan the directory to see if there it contains an entry pointing to * the directory that we are repairing. */ - lock_mode = xfs_ilock_data_map_shared(dp); + error = xrep_dinode_trylock_directory(ri, dp, &lock_mode); + if (error) + return error; /* * If this directory is known to be sick, we cannot scan it reliably @@ -356,6 +408,7 @@ xrep_dinode_find_mode( * so there's a real possibility that _iscan_iter can return EBUSY. */ xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan); + xchk_iscan_set_agi_trylock(&ri->ftype_iscan); ri->ftype_iscan.skip_ino = sc->sm->sm_ino; ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN; while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) { @@ -463,6 +516,17 @@ xrep_dinode_mode( return 0; } +/* Fix unused link count fields having nonzero values. */ +STATIC void +xrep_dinode_nlinks( + struct xfs_dinode *dip) +{ + if (dip->di_version > 1) + dip->di_onlink = 0; + else + dip->di_nlink = 0; +} + /* Fix any conflicting flags that the verifiers complain about. */ STATIC void xrep_dinode_flags( @@ -1324,6 +1388,7 @@ xrep_dinode_core( iget_error = xrep_dinode_mode(ri, dip); if (iget_error) goto write; + xrep_dinode_nlinks(dip); xrep_dinode_flags(sc, dip, ri->rt_extents > 0); xrep_dinode_size(ri, dip); xrep_dinode_extsize_hints(sc, dip); @@ -1671,6 +1736,44 @@ xrep_inode_extsize( } } +/* Ensure this file has an attr fork if it needs to hold a parent pointer. */ +STATIC int +xrep_inode_pptr( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + struct inode *inode = VFS_I(ip); + + if (!xfs_has_parent(mp)) + return 0; + + /* + * Unlinked inodes that cannot be added to the directory tree will not + * have a parent pointer. + */ + if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + return 0; + + /* The root directory doesn't have a parent pointer. 
*/ + if (ip == mp->m_rootip) + return 0; + + /* + * Metadata inodes are rooted in the superblock and do not have any + * parents. + */ + if (xfs_is_metadata_inode(ip)) + return 0; + + /* Inode already has an attr fork; no further work possible here. */ + if (xfs_inode_has_attr_fork(ip)) + return 0; + + return xfs_bmap_add_attrfork(sc->tp, ip, + sizeof(struct xfs_attr_sf_hdr), true); +} + /* Fix any irregularities in an inode that the verifiers don't catch. */ STATIC int xrep_inode_problems( @@ -1681,6 +1784,9 @@ xrep_inode_problems( error = xrep_inode_blockcounts(sc); if (error) return error; + error = xrep_inode_pptr(sc); + if (error) + return error; xrep_inode_timestamps(sc->ip); xrep_inode_flags(sc); xrep_inode_ids(sc); @@ -1697,6 +1803,46 @@ xrep_inode_problems( return xrep_roll_trans(sc); } +/* + * Make sure this inode's unlinked list pointers are consistent with its + * link count. + */ +STATIC int +xrep_inode_unlinked( + struct xfs_scrub *sc) +{ + unsigned int nlink = VFS_I(sc->ip)->i_nlink; + int error; + + /* + * If this inode is linked from the directory tree and on the unlinked + * list, remove it from the unlinked list. + */ + if (nlink > 0 && xfs_inode_on_unlinked_list(sc->ip)) { + struct xfs_perag *pag; + int error; + + pag = xfs_perag_get(sc->mp, + XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino)); + error = xfs_iunlink_remove(sc->tp, pag, sc->ip); + xfs_perag_put(pag); + if (error) + return error; + } + + /* + * If this inode is not linked from the directory tree yet not on the + * unlinked list, put it on the unlinked list. + */ + if (nlink == 0 && !xfs_inode_on_unlinked_list(sc->ip)) { + error = xfs_iunlink(sc->tp, sc->ip); + if (error) + return error; + } + + return 0; +} + /* Repair an inode's fields. 
*/ int xrep_inode( @@ -1746,5 +1892,10 @@ xrep_inode( return error; } + /* Reconnect incore unlinked list */ + error = xrep_inode_unlinked(sc); + if (error) + return error; + return xrep_defer_finish(sc); } diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c index ec3478bc505e..cf9d983667ce 100644 --- a/fs/xfs/scrub/iscan.c +++ b/fs/xfs/scrub/iscan.c @@ -243,6 +243,51 @@ xchk_iscan_finish( mutex_unlock(&iscan->lock); } +/* Mark an inode scan finished before we actually scan anything. */ +void +xchk_iscan_finish_early( + struct xchk_iscan *iscan) +{ + ASSERT(iscan->cursor_ino == iscan->scan_start_ino); + ASSERT(iscan->__visited_ino == iscan->scan_start_ino); + + xchk_iscan_finish(iscan); +} + +/* + * Grab the AGI to advance the inode scan. Returns 0 if *agi_bpp is now set, + * -ECANCELED if the live scan aborted, -EBUSY if the AGI could not be grabbed, + * or the usual negative errno. + */ +STATIC int +xchk_iscan_read_agi( + struct xchk_iscan *iscan, + struct xfs_perag *pag, + struct xfs_buf **agi_bpp) +{ + struct xfs_scrub *sc = iscan->sc; + unsigned long relax; + int ret; + + if (!xchk_iscan_agi_needs_trylock(iscan)) + return xfs_ialloc_read_agi(pag, sc->tp, 0, agi_bpp); + + relax = msecs_to_jiffies(iscan->iget_retry_delay); + do { + ret = xfs_ialloc_read_agi(pag, sc->tp, XFS_IALLOC_FLAG_TRYLOCK, + agi_bpp); + if (ret != -EAGAIN) + return ret; + if (!iscan->iget_timeout || + time_is_before_jiffies(iscan->__iget_deadline)) + return -EBUSY; + + trace_xchk_iscan_agi_retry_wait(iscan); + } while (!schedule_timeout_killable(relax) && + !xchk_iscan_aborted(iscan)); + return -ECANCELED; +} + /* * Advance ino to the next inode that the inobt thinks is allocated, being * careful to jump to the next AG if we've reached the right end of this AG's @@ -281,7 +326,7 @@ xchk_iscan_advance( if (!pag) return -ECANCELED; - ret = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); + ret = xchk_iscan_read_agi(iscan, pag, &agi_bp); if (ret) goto out_pag; @@ -363,6 +408,15 @@ 
xchk_iscan_iget_retry( } /* + * For an inode scan, we hold the AGI and want to try to grab a batch of + * inodes. Holding the AGI prevents inodegc from clearing freed inodes, + * so we must use noretry here. For every inode after the first one in the + * batch, we don't want to wait, so we use noretry there too. Finally, use + * dontcache to avoid polluting the cache. + */ +#define ISCAN_IGET_FLAGS (XFS_IGET_NORETRY | XFS_IGET_DONTCACHE) + +/* * Grab an inode as part of an inode scan. While scanning this inode, the * caller must ensure that no other threads can modify the inode until a call * to xchk_iscan_visit succeeds. @@ -389,7 +443,7 @@ xchk_iscan_iget( ASSERT(iscan->__inodes[0] == NULL); /* Fill the first slot in the inode array. */ - error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0, + error = xfs_iget(sc->mp, sc->tp, ino, ISCAN_IGET_FLAGS, 0, &iscan->__inodes[idx]); trace_xchk_iscan_iget(iscan, error); @@ -402,8 +456,13 @@ xchk_iscan_iget( * It's possible that this inode has lost all of its links but * hasn't yet been inactivated. If we don't have a transaction * or it's not writable, flush the inodegc workers and wait. + * If we have a non-empty transaction, we must not block on + * inodegc, which allocates its own transactions. */ - xfs_inodegc_flush(mp); + if (sc->tp && !(sc->tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) + xfs_inodegc_push(mp); + else + xfs_inodegc_flush(mp); return xchk_iscan_iget_retry(iscan, true); } @@ -457,7 +516,7 @@ xchk_iscan_iget( ASSERT(iscan->__inodes[idx] == NULL); - error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0, + error = xfs_iget(sc->mp, sc->tp, ino, ISCAN_IGET_FLAGS, 0, &iscan->__inodes[idx]); if (error) break; diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h index 71f657552dfa..f9f47fa01a9e 100644 --- a/fs/xfs/scrub/iscan.h +++ b/fs/xfs/scrub/iscan.h @@ -59,6 +59,9 @@ struct xchk_iscan { /* Set if the scan has been aborted due to some event in the fs.
*/ #define XCHK_ISCAN_OPSTATE_ABORTED (1) +/* Use trylock to acquire the AGI */ +#define XCHK_ISCAN_OPSTATE_TRYLOCK_AGI (2) + static inline bool xchk_iscan_aborted(const struct xchk_iscan *iscan) { @@ -71,8 +74,21 @@ xchk_iscan_abort(struct xchk_iscan *iscan) set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); } +static inline bool +xchk_iscan_agi_needs_trylock(const struct xchk_iscan *iscan) +{ + return test_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate); +} + +static inline void +xchk_iscan_set_agi_trylock(struct xchk_iscan *iscan) +{ + set_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate); +} + void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout, unsigned int iget_retry_delay, struct xchk_iscan *iscan); +void xchk_iscan_finish_early(struct xchk_iscan *iscan); void xchk_iscan_teardown(struct xchk_iscan *iscan); int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp); diff --git a/fs/xfs/scrub/listxattr.c b/fs/xfs/scrub/listxattr.c new file mode 100644 index 000000000000..256ff7700c94 --- /dev/null +++ b/fs/xfs/scrub/listxattr.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_sf.h" +#include "xfs_trans.h" +#include "scrub/scrub.h" +#include "scrub/bitmap.h" +#include "scrub/dab_bitmap.h" +#include "scrub/listxattr.h" + +/* Call a function for every entry in a shortform xattr structure. 
*/ +STATIC int +xchk_xattr_walk_sf( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + void *priv) +{ + struct xfs_attr_sf_hdr *hdr = ip->i_af.if_data; + struct xfs_attr_sf_entry *sfe; + unsigned int i; + int error; + + sfe = xfs_attr_sf_firstentry(hdr); + for (i = 0; i < hdr->count; i++) { + error = attr_fn(sc, ip, sfe->flags, sfe->nameval, sfe->namelen, + &sfe->nameval[sfe->namelen], sfe->valuelen, + priv); + if (error) + return error; + + sfe = xfs_attr_sf_nextentry(sfe); + } + + return 0; +} + +/* Call a function for every entry in this xattr leaf block. */ +STATIC int +xchk_xattr_walk_leaf_entries( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + struct xfs_buf *bp, + void *priv) +{ + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_mount *mp = sc->mp; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr_leaf_entry *entry; + unsigned int i; + int error; + + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); + + for (i = 0; i < ichdr.count; entry++, i++) { + void *value; + unsigned char *name; + unsigned int namelen, valuelen; + + if (entry->flags & XFS_ATTR_LOCAL) { + struct xfs_attr_leaf_name_local *name_loc; + + name_loc = xfs_attr3_leaf_name_local(leaf, i); + name = name_loc->nameval; + namelen = name_loc->namelen; + value = &name_loc->nameval[name_loc->namelen]; + valuelen = be16_to_cpu(name_loc->valuelen); + } else { + struct xfs_attr_leaf_name_remote *name_rmt; + + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + name = name_rmt->name; + namelen = name_rmt->namelen; + value = NULL; + valuelen = be32_to_cpu(name_rmt->valuelen); + } + + error = attr_fn(sc, ip, entry->flags, name, namelen, value, + valuelen, priv); + if (error) + return error; + + } + + return 0; +} + +/* + * Call a function for every entry in a leaf-format xattr structure. Avoid + * memory allocations for the loop detector since there's only one block. 
+ */ +STATIC int +xchk_xattr_walk_leaf( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + void *priv) +{ + struct xfs_buf *leaf_bp; + int error; + + error = xfs_attr3_leaf_read(sc->tp, ip, ip->i_ino, 0, &leaf_bp); + if (error) + return error; + + error = xchk_xattr_walk_leaf_entries(sc, ip, attr_fn, leaf_bp, priv); + xfs_trans_brelse(sc->tp, leaf_bp); + return error; +} + +/* Find the leftmost leaf in the xattr dabtree. */ +STATIC int +xchk_xattr_find_leftmost_leaf( + struct xfs_scrub *sc, + struct xfs_inode *ip, + struct xdab_bitmap *seen_dablks, + struct xfs_buf **leaf_bpp) +{ + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_buf *bp; + xfs_failaddr_t fa; + xfs_dablk_t blkno = 0; + unsigned int expected_level = 0; + int error; + + for (;;) { + xfs_extlen_t len = 1; + uint16_t magic; + + /* Make sure we haven't seen this new block already. */ + if (xdab_bitmap_test(seen_dablks, blkno, &len)) + return -EFSCORRUPTED; + + error = xfs_da3_node_read(tp, ip, blkno, &bp, XFS_ATTR_FORK); + if (error) + return error; + + node = bp->b_addr; + magic = be16_to_cpu(node->hdr.info.magic); + if (magic == XFS_ATTR_LEAF_MAGIC || + magic == XFS_ATTR3_LEAF_MAGIC) + break; + + error = -EFSCORRUPTED; + if (magic != XFS_DA_NODE_MAGIC && + magic != XFS_DA3_NODE_MAGIC) + goto out_buf; + + fa = xfs_da3_node_header_check(bp, ip->i_ino); + if (fa) + goto out_buf; + + xfs_da3_node_hdr_from_disk(mp, &nodehdr, node); + + if (nodehdr.count == 0 || nodehdr.level >= XFS_DA_NODE_MAXDEPTH) + goto out_buf; + + /* Check the level from the root node. */ + if (blkno == 0) + expected_level = nodehdr.level - 1; + else if (expected_level != nodehdr.level) + goto out_buf; + else + expected_level--; + + /* Remember that we've seen this node. 
*/ + error = xdab_bitmap_set(seen_dablks, blkno, 1); + if (error) + goto out_buf; + + /* Find the next level towards the leaves of the dabtree. */ + btree = nodehdr.btree; + blkno = be32_to_cpu(btree->before); + xfs_trans_brelse(tp, bp); + } + + error = -EFSCORRUPTED; + fa = xfs_attr3_leaf_header_check(bp, ip->i_ino); + if (fa) + goto out_buf; + + if (expected_level != 0) + goto out_buf; + + /* Remember that we've seen this leaf. */ + error = xdab_bitmap_set(seen_dablks, blkno, 1); + if (error) + goto out_buf; + + *leaf_bpp = bp; + return 0; + +out_buf: + xfs_trans_brelse(tp, bp); + return error; +} + +/* Call a function for every entry in a node-format xattr structure. */ +STATIC int +xchk_xattr_walk_node( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + xchk_xattrleaf_fn leaf_fn, + void *priv) +{ + struct xfs_attr3_icleaf_hdr leafhdr; + struct xdab_bitmap seen_dablks; + struct xfs_mount *mp = sc->mp; + struct xfs_attr_leafblock *leaf; + struct xfs_buf *leaf_bp; + int error; + + xdab_bitmap_init(&seen_dablks); + + error = xchk_xattr_find_leftmost_leaf(sc, ip, &seen_dablks, &leaf_bp); + if (error) + goto out_bitmap; + + for (;;) { + xfs_extlen_t len; + + error = xchk_xattr_walk_leaf_entries(sc, ip, attr_fn, leaf_bp, + priv); + if (error) + goto out_leaf; + + /* Find the right sibling of this leaf block. */ + leaf = leaf_bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); + if (leafhdr.forw == 0) + goto out_leaf; + + xfs_trans_brelse(sc->tp, leaf_bp); + + if (leaf_fn) { + error = leaf_fn(sc, priv); + if (error) + goto out_bitmap; + } + + /* Make sure we haven't seen this new leaf already. */ + len = 1; + if (xdab_bitmap_test(&seen_dablks, leafhdr.forw, &len)) { + error = -EFSCORRUPTED; + goto out_bitmap; + } + + error = xfs_attr3_leaf_read(sc->tp, ip, ip->i_ino, + leafhdr.forw, &leaf_bp); + if (error) + goto out_bitmap; + + /* Remember that we've seen this new leaf. 
*/ + error = xdab_bitmap_set(&seen_dablks, leafhdr.forw, 1); + if (error) + goto out_leaf; + } + +out_leaf: + xfs_trans_brelse(sc->tp, leaf_bp); +out_bitmap: + xdab_bitmap_destroy(&seen_dablks); + return error; +} + +/* + * Call a function for every extended attribute in a file. + * + * Callers must hold the ILOCK. No validation or cursor restarts allowed. + * Returns -EFSCORRUPTED on any problem, including loops in the dabtree. + */ +int +xchk_xattr_walk( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + xchk_xattrleaf_fn leaf_fn, + void *priv) +{ + int error; + + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + if (!xfs_inode_hasattr(ip)) + return 0; + + if (ip->i_af.if_format == XFS_DINODE_FMT_LOCAL) + return xchk_xattr_walk_sf(sc, ip, attr_fn, priv); + + /* attr functions require that the attr fork is loaded */ + error = xfs_iread_extents(sc->tp, ip, XFS_ATTR_FORK); + if (error) + return error; + + if (xfs_attr_is_leaf(ip)) + return xchk_xattr_walk_leaf(sc, ip, attr_fn, priv); + + return xchk_xattr_walk_node(sc, ip, attr_fn, leaf_fn, priv); +} diff --git a/fs/xfs/scrub/listxattr.h b/fs/xfs/scrub/listxattr.h new file mode 100644 index 000000000000..703cfb7b14cf --- /dev/null +++ b/fs/xfs/scrub/listxattr.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_LISTXATTR_H__ +#define __XFS_SCRUB_LISTXATTR_H__ + +typedef int (*xchk_xattr_fn)(struct xfs_scrub *sc, struct xfs_inode *ip, + unsigned int attr_flags, const unsigned char *name, + unsigned int namelen, const void *value, unsigned int valuelen, + void *priv); + +typedef int (*xchk_xattrleaf_fn)(struct xfs_scrub *sc, void *priv); + +int xchk_xattr_walk(struct xfs_scrub *sc, struct xfs_inode *ip, + xchk_xattr_fn attr_fn, xchk_xattrleaf_fn leaf_fn, void *priv); + +#endif /* __XFS_SCRUB_LISTXATTR_H__ */ diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c index 8a7d9557897c..80aee30886c4 100644 --- a/fs/xfs/scrub/nlinks.c +++ b/fs/xfs/scrub/nlinks.c @@ -18,15 +18,19 @@ #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_ag.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/iscan.h" +#include "scrub/orphanage.h" #include "scrub/nlinks.h" #include "scrub/trace.h" #include "scrub/readdir.h" +#include "scrub/tempfile.h" +#include "scrub/listxattr.h" /* * Live Inode Link Count Checking @@ -43,11 +47,23 @@ int xchk_setup_nlinks( struct xfs_scrub *sc) { + struct xchk_nlink_ctrs *xnc; + int error; + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); - sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS); - if (!sc->buf) + if (xchk_could_repair(sc)) { + error = xrep_setup_nlinks(sc); + if (error) + return error; + } + + xnc = kvzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS); + if (!xnc) return -ENOMEM; + xnc->xname.name = xnc->namebuf; + xnc->sc = sc; + sc->buf = xnc; return xchk_setup_fs(sc); } @@ -152,6 +168,13 @@ xchk_nlinks_live_update( xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb); + /* + * Ignore temporary directories being used to stage dir repairs, since + * we don't bump the link counts of the children. 
+ */ + if (xrep_is_tempfile(p->dp)) + return NOTIFY_DONE; + trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino, p->delta, p->name->name, p->name->len); @@ -251,12 +274,17 @@ xchk_nlinks_collect_dirent( * number of parents of the root directory. * * Otherwise, increment the number of backrefs pointing back to ino. + * + * If the filesystem has parent pointers, we walk the pptrs to + * determine the backref count. */ if (dotdot) { if (dp == sc->mp->m_rootip) error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); - else + else if (!xfs_has_parent(sc->mp)) error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0); + else + error = 0; if (error) goto out_unlock; } @@ -293,6 +321,61 @@ out_incomplete: return error; } +/* Bump the backref count for the inode referenced by this parent pointer. */ +STATIC int +xchk_nlinks_collect_pptr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + }; + struct xchk_nlink_ctrs *xnc = priv; + const struct xfs_parent_rec *pptr_rec = value; + xfs_ino_t parent_ino; + int error; + + /* Update the shadow link counts if we haven't already failed. 
*/ + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + error = -ECANCELED; + goto out_incomplete; + } + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, &parent_ino, NULL); + if (error) + return error; + + trace_xchk_nlinks_collect_pptr(sc->mp, ip, &xname, pptr_rec); + + mutex_lock(&xnc->lock); + + error = xchk_nlinks_update_incore(xnc, parent_ino, 0, 1, 0); + if (error) + goto out_unlock; + + mutex_unlock(&xnc->lock); + return 0; + +out_unlock: + mutex_unlock(&xnc->lock); + xchk_iscan_abort(&xnc->collect_iscan); +out_incomplete: + xchk_set_incomplete(sc); + return error; +} + /* Walk a directory to bump the observed link counts of the children. */ STATIC int xchk_nlinks_collect_dir( @@ -303,6 +386,13 @@ xchk_nlinks_collect_dir( unsigned int lock_mode; int error = 0; + /* + * Ignore temporary directories being used to stage dir repairs, since + * we don't bump the link counts of the children. + */ + if (xrep_is_tempfile(dp)) + return 0; + /* Prevent anyone from changing this directory while we walk it. */ xfs_ilock(dp, XFS_IOLOCK_SHARED); lock_mode = xfs_ilock_data_map_shared(dp); @@ -332,6 +422,28 @@ xchk_nlinks_collect_dir( if (error) goto out_abort; + /* Walk the parent pointers to get real backref counts. */ + if (xfs_has_parent(sc->mp)) { + /* + * If the extended attributes look as though they have been + * zapped by the inode record repair code, we cannot scan for + * parent pointers.
+ */ + if (xchk_pptr_looks_zapped(dp)) { + error = -EBUSY; + goto out_unlock; + } + + error = xchk_xattr_walk(sc, dp, xchk_nlinks_collect_pptr, NULL, + xnc); + if (error == -ECANCELED) { + error = 0; + goto out_unlock; + } + if (error) + goto out_abort; + } + xchk_iscan_mark_visited(&xnc->collect_iscan, dp); goto out_unlock; @@ -537,6 +649,14 @@ xchk_nlinks_compare_inode( unsigned int actual_nlink; int error; + /* + * Ignore temporary files being used to stage repairs, since we assume + * they're correct for non-directories, and the directory repair code + * doesn't bump the link counts for the children. + */ + if (xrep_is_tempfile(ip)) + return 0; + xfs_ilock(ip, XFS_ILOCK_SHARED); mutex_lock(&xnc->lock); @@ -571,9 +691,11 @@ xchk_nlinks_compare_inode( * this as a corruption. The VFS won't let users increase the link * count, but it will let them decrease it. */ - if (total_links > XFS_MAXLINK) { + if (total_links > XFS_NLINK_PINNED) { xchk_ino_set_corrupt(sc, ip->i_ino); goto out_corrupt; + } else if (total_links > XFS_MAXLINK) { + xchk_ino_set_warning(sc, ip->i_ino); } /* Link counts should match. */ @@ -850,9 +972,6 @@ xchk_nlinks_setup_scan( xfs_agino_t first_agino, last_agino; int error; - ASSERT(xnc->sc == NULL); - xnc->sc = sc; - mutex_init(&xnc->lock); /* Retry iget every tenth of a second for up to 30 seconds. */ diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h index a950f3daf204..b820712bfd87 100644 --- a/fs/xfs/scrub/nlinks.h +++ b/fs/xfs/scrub/nlinks.h @@ -28,6 +28,13 @@ struct xchk_nlink_ctrs { * from other writer threads. */ struct xfs_dir_hook dhook; + + /* Orphanage reparenting request. */ + struct xrep_adoption adoption; + + /* Directory entry name, plus the trailing null. 
*/ + struct xfs_name xname; + char namebuf[MAXNAMELEN]; }; /* diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c index b87618322f55..b3e707f47b7b 100644 --- a/fs/xfs/scrub/nlinks_repair.c +++ b/fs/xfs/scrub/nlinks_repair.c @@ -17,14 +17,19 @@ #include "xfs_iwalk.h" #include "xfs_ialloc.h" #include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/iscan.h" +#include "scrub/orphanage.h" #include "scrub/nlinks.h" #include "scrub/trace.h" +#include "scrub/tempfile.h" /* * Live Inode Link Count Repair @@ -36,6 +41,48 @@ * inode is locked. */ +/* Set up to repair inode link counts. */ +int +xrep_setup_nlinks( + struct xfs_scrub *sc) +{ + return xrep_orphanage_try_create(sc); +} + +/* + * Inodes that aren't the root directory or the orphanage, have a nonzero link + * count, and no observed parents should be moved to the orphanage. + */ +static inline bool +xrep_nlinks_is_orphaned( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int actual_nlink, + const struct xchk_nlink *obs) +{ + struct xfs_mount *mp = ip->i_mount; + + if (obs->parents != 0) + return false; + if (ip == mp->m_rootip || ip == sc->orphanage) + return false; + return actual_nlink != 0; +} + +/* Remove an inode from the unlinked list. */ +STATIC int +xrep_nlinks_iunlink_remove( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag; + int error; + + pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino)); + error = xfs_iunlink_remove(sc->tp, pag, sc->ip); + xfs_perag_put(pag); + return error; +} + /* * Correct the link count of the given inode. Because we have to grab locks * and resources in a certain order, it's possible that this will be a no-op. 
@@ -50,17 +97,55 @@ xrep_nlinks_repair_inode( struct xfs_inode *ip = sc->ip; uint64_t total_links; uint64_t actual_nlink; + bool orphanage_available = false; bool dirty = false; int error; - xchk_ilock(sc, XFS_IOLOCK_EXCL); + /* + * Ignore temporary files being used to stage repairs, since we assume + * they're correct for non-directories, and the directory repair code + * doesn't bump the link counts for the children. + */ + if (xrep_is_tempfile(ip)) + return 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &sc->tp); - if (error) - return error; + /* + * If the filesystem has an orphanage attached to the scrub context, + * prepare for a link count repair that could involve @ip being adopted + * by the lost+found. + */ + if (xrep_orphanage_can_adopt(sc)) { + error = xrep_orphanage_iolock_two(sc); + if (error) + return error; - xchk_ilock(sc, XFS_ILOCK_EXCL); - xfs_trans_ijoin(sc->tp, ip, 0); + error = xrep_adoption_trans_alloc(sc, &xnc->adoption); + if (error) { + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + } else { + orphanage_available = true; + } + } + + /* + * Either there is no orphanage or we couldn't allocate resources for + * that kind of update. Let's try again with only the resources we + * need for a simple link count update, since that's much more common. + */ + if (!orphanage_available) { + xchk_ilock(sc, XFS_IOLOCK_EXCL); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, + &sc->tp); + if (error) { + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + return error; + } + + xchk_ilock(sc, XFS_ILOCK_EXCL); + xfs_trans_ijoin(sc->tp, ip, 0); + } mutex_lock(&xnc->lock); @@ -99,28 +184,68 @@ xrep_nlinks_repair_inode( } /* - * We did not find any links to this inode. If the inode agrees, we - * have nothing further to do. If not, the inode has a nonzero link - * count and we don't have anywhere to graft the child onto. 
Dropping - * a live inode's link count to zero can cause unexpected shutdowns in - * inactivation, so leave it alone. + * Decide if we're going to move this file to the orphanage, and fix + * up the incore link counts if we are. */ - if (total_links == 0) { - if (actual_nlink != 0) - trace_xrep_nlinks_unfixable_inode(mp, ip, &obs); - goto out_trans; + if (orphanage_available && + xrep_nlinks_is_orphaned(sc, ip, actual_nlink, &obs)) { + /* Figure out what name we're going to use here. */ + error = xrep_adoption_compute_name(&xnc->adoption, &xnc->xname); + if (error) + goto out_trans; + + /* + * Reattach this file to the directory tree by moving it to + * the orphanage per the adoption parameters that we already + * computed. + */ + error = xrep_adoption_move(&xnc->adoption); + if (error) + goto out_trans; + + /* + * Re-read the link counts since the reparenting will have + * updated our scan info. + */ + mutex_lock(&xnc->lock); + error = xfarray_load_sparse(xnc->nlinks, ip->i_ino, &obs); + mutex_unlock(&xnc->lock); + if (error) + goto out_trans; + + total_links = xchk_nlink_total(ip, &obs); + actual_nlink = VFS_I(ip)->i_nlink; + dirty = true; } - /* Commit the new link count if it changed. */ - if (total_links != actual_nlink) { - if (total_links > XFS_MAXLINK) { - trace_xrep_nlinks_unfixable_inode(mp, ip, &obs); + /* + * If this inode is linked from the directory tree and on the unlinked + * list, remove it from the unlinked list. + */ + if (total_links > 0 && xfs_inode_on_unlinked_list(ip)) { + error = xrep_nlinks_iunlink_remove(sc); + if (error) goto out_trans; - } + dirty = true; + } + /* + * If this inode is not linked from the directory tree yet not on the + * unlinked list, put it on the unlinked list. + */ + if (total_links == 0 && !xfs_inode_on_unlinked_list(ip)) { + error = xfs_iunlink(sc->tp, ip); + if (error) + goto out_trans; + dirty = true; + } + + /* Commit the new link count if it changed. 
*/ + if (total_links != actual_nlink) { trace_xrep_nlinks_update_inode(mp, ip, &obs); - set_nlink(VFS_I(ip), total_links); + set_nlink(VFS_I(ip), min_t(unsigned long long, total_links, + XFS_NLINK_PINNED)); dirty = true; } @@ -132,14 +257,19 @@ xrep_nlinks_repair_inode( xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE); error = xrep_trans_commit(sc); - xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - return error; + goto out_unlock; out_scanlock: mutex_unlock(&xnc->lock); out_trans: xchk_trans_cancel(sc); - xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); +out_unlock: + xchk_iunlock(sc, XFS_ILOCK_EXCL); + if (orphanage_available) { + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + } + xchk_iunlock(sc, XFS_IOLOCK_EXCL); return error; } @@ -172,10 +302,10 @@ xrep_nlinks( /* * We need ftype for an accurate count of the number of child * subdirectory links. Child subdirectories with a back link (dotdot - * entry) but no forward link are unfixable, so we cannot repair the - * link count of the parent directory based on the back link count - * alone. Filesystems without ftype support are rare (old V4) so we - * just skip out here. + * entry) but no forward link are moved to the orphanage, so we cannot + * repair the link count of the parent directory based on the back link + * count alone. Filesystems without ftype support are rare (old V4) so + * we just skip out here. */ if (!xfs_has_ftype(sc->mp)) return -EOPNOTSUPP; diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c new file mode 100644 index 000000000000..7148d8362db8 --- /dev/null +++ b/fs/xfs/scrub/orphanage.c @@ -0,0 +1,627 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_dir2.h" +#include "xfs_icache.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_parent.h" +#include "xfs_attr_sf.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/trace.h" +#include "scrub/orphanage.h" +#include "scrub/readdir.h" + +#include <linux/namei.h> + +/* + * The Orphanage + * ============= + * + * If the directory tree is damaged, children of that directory become + * inaccessible via that file path. If a child has no other parents, the file + * is said to be orphaned. xfs_repair fixes this situation by creating an + * orphanage directory (specifically, /lost+found) and creating a directory + * entry pointing to the orphaned file. + * + * Online repair follows this tactic by creating a root-owned /lost+found + * directory if one does not exist. If an orphan is found, it will move that + * file into the orphanage. + */ + +/* Make the orphanage owned by root.
*/ +STATIC int +xrep_chown_orphanage( + struct xfs_scrub *sc, + struct xfs_inode *dp) +{ + struct xfs_trans *tp; + struct xfs_mount *mp = sc->mp; + struct xfs_dquot *udqp = NULL, *gdqp = NULL, *pdqp = NULL; + struct xfs_dquot *oldu = NULL, *oldg = NULL, *oldp = NULL; + struct inode *inode = VFS_I(dp); + int error; + + error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, + XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp); + if (error) + return error; + + error = xfs_trans_alloc_ichange(dp, udqp, gdqp, pdqp, true, &tp); + if (error) + goto out_dqrele; + + /* + * Always clear setuid/setgid/sticky on the orphanage since we don't + * normally want that functionality on this directory and xfs_repair + * doesn't create it this way either. Leave the other access bits + * unchanged. + */ + inode->i_mode &= ~(S_ISUID | S_ISGID | S_ISVTX); + + /* + * Change the ownerships and register quota modifications + * in the transaction. + */ + if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID)) { + if (XFS_IS_UQUOTA_ON(mp)) + oldu = xfs_qm_vop_chown(tp, dp, &dp->i_udquot, udqp); + inode->i_uid = GLOBAL_ROOT_UID; + } + if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID)) { + if (XFS_IS_GQUOTA_ON(mp)) + oldg = xfs_qm_vop_chown(tp, dp, &dp->i_gdquot, gdqp); + inode->i_gid = GLOBAL_ROOT_GID; + } + if (dp->i_projid != 0) { + if (XFS_IS_PQUOTA_ON(mp)) + oldp = xfs_qm_vop_chown(tp, dp, &dp->i_pdquot, pdqp); + dp->i_projid = 0; + } + + dp->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + XFS_STATS_INC(mp, xs_ig_attrchg); + + if (xfs_has_wsync(mp)) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp); + + xfs_qm_dqrele(oldu); + xfs_qm_dqrele(oldg); + xfs_qm_dqrele(oldp); + +out_dqrele: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + return error; +} + +#define ORPHANAGE "lost+found" + +/* Create the orphanage directory, and set sc->orphanage to it. 
*/ +int +xrep_orphanage_create( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct dentry *root_dentry, *orphanage_dentry; + struct inode *root_inode = VFS_I(sc->mp->m_rootip); + struct inode *orphanage_inode; + int error; + + if (xfs_is_shutdown(mp)) + return -EIO; + if (xfs_is_readonly(mp)) { + sc->orphanage = NULL; + return 0; + } + + ASSERT(sc->tp == NULL); + ASSERT(sc->orphanage == NULL); + + /* Find the dentry for the root directory... */ + root_dentry = d_find_alias(root_inode); + if (!root_dentry) { + error = -EFSCORRUPTED; + goto out; + } + + /* ...which is a directory, right? */ + if (!d_is_dir(root_dentry)) { + error = -EFSCORRUPTED; + goto out_dput_root; + } + + /* Try to find the orphanage directory. */ + inode_lock_nested(root_inode, I_MUTEX_PARENT); + orphanage_dentry = lookup_one_len(ORPHANAGE, root_dentry, + strlen(ORPHANAGE)); + if (IS_ERR(orphanage_dentry)) { + error = PTR_ERR(orphanage_dentry); + goto out_unlock_root; + } + + /* + * Nothing found? Call mkdir to create the orphanage. Create the + * directory without other-user access because we're live and someone + * could have been relying partly on minimal access to a parent + * directory to control access to a file we put in here. + */ + if (d_really_is_negative(orphanage_dentry)) { + error = vfs_mkdir(&nop_mnt_idmap, root_inode, orphanage_dentry, + 0750); + if (error) + goto out_dput_orphanage; + } + + /* Not a directory? Bail out. */ + if (!d_is_dir(orphanage_dentry)) { + error = -ENOTDIR; + goto out_dput_orphanage; + } + + /* + * Grab a reference to the orphanage. This /should/ succeed since + * we hold the root directory locked and therefore nobody can delete + * the orphanage. + */ + orphanage_inode = igrab(d_inode(orphanage_dentry)); + if (!orphanage_inode) { + error = -ENOENT; + goto out_dput_orphanage; + } + + /* Make sure the orphanage is owned by root. 
*/ + error = xrep_chown_orphanage(sc, XFS_I(orphanage_inode)); + if (error) + goto out_dput_orphanage; + + /* Stash the reference for later and bail out. */ + sc->orphanage = XFS_I(orphanage_inode); + sc->orphanage_ilock_flags = 0; + +out_dput_orphanage: + dput(orphanage_dentry); +out_unlock_root: + inode_unlock(VFS_I(sc->mp->m_rootip)); +out_dput_root: + dput(root_dentry); +out: + return error; +} + +void +xrep_orphanage_ilock( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + sc->orphanage_ilock_flags |= ilock_flags; + xfs_ilock(sc->orphanage, ilock_flags); +} + +bool +xrep_orphanage_ilock_nowait( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + if (xfs_ilock_nowait(sc->orphanage, ilock_flags)) { + sc->orphanage_ilock_flags |= ilock_flags; + return true; + } + + return false; +} + +void +xrep_orphanage_iunlock( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + xfs_iunlock(sc->orphanage, ilock_flags); + sc->orphanage_ilock_flags &= ~ilock_flags; +} + +/* Grab the IOLOCK of the orphanage and sc->ip. */ +int +xrep_orphanage_iolock_two( + struct xfs_scrub *sc) +{ + int error = 0; + + while (true) { + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Normal XFS takes the IOLOCK before grabbing a transaction. + * Scrub holds a transaction, which means that we can't block + * on either IOLOCK. + */ + if (xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) { + if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + break; + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + } + delay(1); + } + + return 0; +} + +/* Release the orphanage. */ +void +xrep_orphanage_rele( + struct xfs_scrub *sc) +{ + if (!sc->orphanage) + return; + + if (sc->orphanage_ilock_flags) + xfs_iunlock(sc->orphanage, sc->orphanage_ilock_flags); + + xchk_irele(sc, sc->orphanage); + sc->orphanage = NULL; +} + +/* Adoption moves a file into /lost+found */ + +/* Can the orphanage adopt @sc->ip? 
*/ +bool +xrep_orphanage_can_adopt( + struct xfs_scrub *sc) +{ + ASSERT(sc->ip != NULL); + + if (!sc->orphanage) + return false; + if (sc->ip == sc->orphanage) + return false; + if (xfs_internal_inum(sc->mp, sc->ip->i_ino)) + return false; + return true; +} + +/* + * Create a new transaction to send a child to the orphanage. + * + * Allocate a new transaction with sufficient disk space to handle the + * adoption, take ILOCK_EXCL of the orphanage and sc->ip, join them to the + * transaction, and reserve quota to reparent the latter. Caller must hold the + * IOLOCK of the orphanage and sc->ip. + */ +int +xrep_adoption_trans_alloc( + struct xfs_scrub *sc, + struct xrep_adoption *adopt) +{ + struct xfs_mount *mp = sc->mp; + unsigned int child_blkres = 0; + int error; + + ASSERT(sc->tp == NULL); + ASSERT(sc->ip != NULL); + ASSERT(sc->orphanage != NULL); + ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL); + ASSERT(sc->orphanage_ilock_flags & XFS_IOLOCK_EXCL); + ASSERT(!(sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))); + ASSERT(!(sc->orphanage_ilock_flags & + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))); + + /* Compute the worst case space reservation that we need. */ + adopt->sc = sc; + adopt->orphanage_blkres = xfs_link_space_res(mp, MAXNAMELEN); + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) + child_blkres = xfs_rename_space_res(mp, 0, false, + xfs_name_dotdot.len, false); + if (xfs_has_parent(mp)) + child_blkres += XFS_ADDAFORK_SPACE_RES(mp); + adopt->child_blkres = child_blkres; + + /* + * Allocate a transaction to link the child into the parent, along with + * enough disk space to handle expansion of both the orphanage and the + * dotdot entry of a child directory.
+ */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, + adopt->orphanage_blkres + adopt->child_blkres, 0, 0, + &sc->tp); + if (error) + return error; + + xfs_lock_two_inodes(sc->orphanage, XFS_ILOCK_EXCL, + sc->ip, XFS_ILOCK_EXCL); + sc->ilock_flags |= XFS_ILOCK_EXCL; + sc->orphanage_ilock_flags |= XFS_ILOCK_EXCL; + + xfs_trans_ijoin(sc->tp, sc->orphanage, 0); + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* + * Reserve enough quota in the orphan directory to add the new name. + * Normally the orphanage should have user/group/project ids of zero + * and hence is not subject to quota enforcement, but we're allowed to + * exceed quota to reattach disconnected parts of the directory tree. + */ + error = xfs_trans_reserve_quota_nblks(sc->tp, sc->orphanage, + adopt->orphanage_blkres, 0, true); + if (error) + goto out_cancel; + + /* + * Reserve enough quota in the child directory to change dotdot. + * Here we're also allowed to exceed file quota to repair inconsistent + * metadata. + */ + if (adopt->child_blkres) { + error = xfs_trans_reserve_quota_nblks(sc->tp, sc->ip, + adopt->child_blkres, 0, true); + if (error) + goto out_cancel; + } + + return 0; +out_cancel: + xchk_trans_cancel(sc); + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + return error; +} + +/* + * Compute the xfs_name for the directory entry that we're adding to the + * orphanage. Caller must hold ILOCKs of sc->ip and the orphanage and must not + * reuse namebuf until the adoption completes or is dissolved. + */ +int +xrep_adoption_compute_name( + struct xrep_adoption *adopt, + struct xfs_name *xname) +{ + struct xfs_scrub *sc = adopt->sc; + char *namebuf = (void *)xname->name; + xfs_ino_t ino; + unsigned int incr = 0; + int error = 0; + + adopt->xname = xname; + xname->len = snprintf(namebuf, MAXNAMELEN, "%llu", sc->ip->i_ino); + xname->type = xfs_mode_to_ftype(VFS_I(sc->ip)->i_mode); + + /* Make sure the filename is unique in the lost+found. 
*/ + error = xchk_dir_lookup(sc, sc->orphanage, xname, &ino); + while (error == 0 && incr < 10000) { + xname->len = snprintf(namebuf, MAXNAMELEN, "%llu.%u", + sc->ip->i_ino, ++incr); + error = xchk_dir_lookup(sc, sc->orphanage, xname, &ino); + } + if (error == 0) { + /* We already have 10,000 entries in the orphanage? */ + return -EFSCORRUPTED; + } + + if (error != -ENOENT) + return error; + return 0; +} + +/* + * Make sure the dcache does not have a positive dentry for the name we've + * chosen. The caller should have checked with the ondisk directory, so any + * discrepancy is a sign that something is seriously wrong. + */ +static int +xrep_adoption_check_dcache( + struct xrep_adoption *adopt) +{ + struct qstr qname = QSTR_INIT(adopt->xname->name, + adopt->xname->len); + struct xfs_scrub *sc = adopt->sc; + struct dentry *d_orphanage, *d_child; + int error = 0; + + d_orphanage = d_find_alias(VFS_I(sc->orphanage)); + if (!d_orphanage) + return 0; + + d_child = d_hash_and_lookup(d_orphanage, &qname); + if (d_child) { + trace_xrep_adoption_check_child(sc->mp, d_child); + + if (d_is_positive(d_child)) { + ASSERT(d_is_negative(d_child)); + error = -EFSCORRUPTED; + } + + dput(d_child); + } + + dput(d_orphanage); + return error; +} + +/* + * Invalidate all dentries for the name that was added to the orphanage + * directory, and all dentries pointing to the child inode that was moved. + * + * There should not be any positive entries for the name, since we've + * maintained our lock on the orphanage directory. 
+ */ +static void +xrep_adoption_zap_dcache( + struct xrep_adoption *adopt) +{ + struct qstr qname = QSTR_INIT(adopt->xname->name, + adopt->xname->len); + struct xfs_scrub *sc = adopt->sc; + struct dentry *d_orphanage, *d_child; + + /* Invalidate all dentries for the adoption name */ + d_orphanage = d_find_alias(VFS_I(sc->orphanage)); + if (!d_orphanage) + return; + + d_child = d_hash_and_lookup(d_orphanage, &qname); + while (d_child != NULL) { + trace_xrep_adoption_invalidate_child(sc->mp, d_child); + + ASSERT(d_is_negative(d_child)); + d_invalidate(d_child); + dput(d_child); + d_child = d_lookup(d_orphanage, &qname); + } + + dput(d_orphanage); + + /* Invalidate all the dentries pointing down to this file. */ + while ((d_child = d_find_alias(VFS_I(sc->ip))) != NULL) { + trace_xrep_adoption_invalidate_child(sc->mp, d_child); + + d_invalidate(d_child); + dput(d_child); + } +} + +/* + * If we have to add an attr fork ahead of a parent pointer update, how much + * space should we ask for? + */ +static inline int +xrep_adoption_attr_sizeof( + const struct xrep_adoption *adopt) +{ + return sizeof(struct xfs_attr_sf_hdr) + + xfs_attr_sf_entsize_byname(sizeof(struct xfs_parent_rec), + adopt->xname->len); +} + +/* + * Move the current file to the orphanage under the computed name. + * + * Returns with a dirty transaction so that the caller can handle any other + * work, such as fixing up unlinked lists or resetting link counts. + */ +int +xrep_adoption_move( + struct xrep_adoption *adopt) +{ + struct xfs_scrub *sc = adopt->sc; + bool isdir = S_ISDIR(VFS_I(sc->ip)->i_mode); + int error; + + trace_xrep_adoption_reparent(sc->orphanage, adopt->xname, + sc->ip->i_ino); + + error = xrep_adoption_check_dcache(adopt); + if (error) + return error; + + /* + * If this filesystem has parent pointers, ensure that the file being + * moved to the orphanage has an attribute fork. This is required + * because the parent pointer code does not itself add attr forks. 
+ */ + if (!xfs_inode_has_attr_fork(sc->ip) && xfs_has_parent(sc->mp)) { + int sf_size = xrep_adoption_attr_sizeof(adopt); + + error = xfs_bmap_add_attrfork(sc->tp, sc->ip, sf_size, true); + if (error) + return error; + } + + /* Create the new name in the orphanage. */ + error = xfs_dir_createname(sc->tp, sc->orphanage, adopt->xname, + sc->ip->i_ino, adopt->orphanage_blkres); + if (error) + return error; + + /* + * Bump the link count of the orphanage if we just added a + * subdirectory, and update its timestamps. + */ + xfs_trans_ichgtime(sc->tp, sc->orphanage, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + if (isdir) + xfs_bumplink(sc->tp, sc->orphanage); + xfs_trans_log_inode(sc->tp, sc->orphanage, XFS_ILOG_CORE); + + /* Bump the link count of the child. */ + if (adopt->bump_child_nlink) { + xfs_bumplink(sc->tp, sc->ip); + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + } + + /* Replace the dotdot entry if the child is a subdirectory. */ + if (isdir) { + error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot, + sc->orphanage->i_ino, adopt->child_blkres); + if (error) + return error; + } + + /* Add a parent pointer from the file back to the lost+found. */ + if (xfs_has_parent(sc->mp)) { + error = xfs_parent_addname(sc->tp, &adopt->ppargs, + sc->orphanage, adopt->xname, sc->ip); + if (error) + return error; + } + + /* + * Notify dirent hooks that we moved the file to /lost+found, and + * finish all the deferred work so that we know the adoption is fully + * recorded in the log. + */ + xfs_dir_update_hook(sc->orphanage, sc->ip, 1, adopt->xname); + + /* Remove negative dentries from the lost+found's dcache */ + xrep_adoption_zap_dcache(adopt); + return 0; +} + +/* + * Roll to a clean scrub transaction so that we can release the orphanage, + * even if xrep_adoption_move was not called. + * + * Commits all the work and deferred ops attached to an adoption request and + * rolls to a clean scrub transaction. 
On success, returns 0 with the scrub + * context holding a clean transaction with no inodes joined. On failure, + * returns negative errno with no scrub transaction. All inode locks are + * still held after this function returns. + */ +int +xrep_adoption_trans_roll( + struct xrep_adoption *adopt) +{ + struct xfs_scrub *sc = adopt->sc; + int error; + + trace_xrep_adoption_trans_roll(sc->orphanage, sc->ip, + !!(sc->tp->t_flags & XFS_TRANS_DIRTY)); + + /* Finish all the deferred ops to commit all repairs. */ + error = xrep_defer_finish(sc); + if (error) + return error; + + /* Roll the transaction once more to detach the inodes. */ + return xfs_trans_roll(&sc->tp); +} diff --git a/fs/xfs/scrub/orphanage.h b/fs/xfs/scrub/orphanage.h new file mode 100644 index 000000000000..7c7a2e7d81db --- /dev/null +++ b/fs/xfs/scrub/orphanage.h @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_ORPHANAGE_H__ +#define __XFS_SCRUB_ORPHANAGE_H__ + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_orphanage_create(struct xfs_scrub *sc); + +/* + * If we're doing a repair, ensure that the orphanage exists and attach it to + * the scrub context. + */ +static inline int +xrep_orphanage_try_create( + struct xfs_scrub *sc) +{ + int error; + + ASSERT(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR); + + error = xrep_orphanage_create(sc); + switch (error) { + case 0: + case -ENOENT: + case -ENOTDIR: + case -ENOSPC: + /* + * If the orphanage can't be found or isn't a directory, we'll + * keep going, but we won't be able to attach the file to the + * orphanage if we can't find the parent. 
+ */ + return 0; + } + + return error; +} + +int xrep_orphanage_iolock_two(struct xfs_scrub *sc); + +void xrep_orphanage_ilock(struct xfs_scrub *sc, unsigned int ilock_flags); +bool xrep_orphanage_ilock_nowait(struct xfs_scrub *sc, + unsigned int ilock_flags); +void xrep_orphanage_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags); + +void xrep_orphanage_rele(struct xfs_scrub *sc); + +/* Information about a request to add a file to the orphanage. */ +struct xrep_adoption { + struct xfs_scrub *sc; + + /* Name used for the adoption. */ + struct xfs_name *xname; + + /* Parent pointer context tracking */ + struct xfs_parent_args ppargs; + + /* Block reservations for orphanage and child (if directory). */ + unsigned int orphanage_blkres; + unsigned int child_blkres; + + /* + * Does the caller want us to bump the child link count? This is not + * needed when reattaching files that have become disconnected but have + * nlink > 1. It is necessary when changing the directory tree + * structure. 
+ */ + bool bump_child_nlink:1; +}; + +bool xrep_orphanage_can_adopt(struct xfs_scrub *sc); + +int xrep_adoption_trans_alloc(struct xfs_scrub *sc, + struct xrep_adoption *adopt); +int xrep_adoption_compute_name(struct xrep_adoption *adopt, + struct xfs_name *xname); +int xrep_adoption_move(struct xrep_adoption *adopt); +int xrep_adoption_trans_roll(struct xrep_adoption *adopt); +#else +struct xrep_adoption { /* empty */ }; +# define xrep_orphanage_rele(sc) ((void)0) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_ORPHANAGE_H__ */ diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 7db873672146..733c410a2279 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -10,19 +10,37 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_inode.h" #include "xfs_icache.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" +#include "xfs_attr.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/readdir.h" +#include "scrub/tempfile.h" +#include "scrub/repair.h" +#include "scrub/listxattr.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/trace.h" /* Set us up to scrub parents. 
*/ int xchk_setup_parent( struct xfs_scrub *sc) { + int error; + + if (xchk_could_repair(sc)) { + error = xrep_setup_parent(sc); + if (error) + return error; + } + return xchk_setup_inode_contents(sc, 0); } @@ -143,7 +161,8 @@ xchk_parent_validate( } if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) return error; - if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) { + if (dp == sc->ip || xrep_is_tempfile(dp) || + !S_ISDIR(VFS_I(dp)->i_mode)) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); goto out_rele; } @@ -185,6 +204,621 @@ out_rele: return error; } +/* + * Checking of Parent Pointers + * =========================== + * + * On filesystems with directory parent pointers, we check the referential + * integrity by visiting each parent pointer of a child file and checking that + * the directory referenced by the pointer actually has a dirent pointing + * forward to the child file. + */ + +/* Deferred parent pointer entry that we saved for later. */ +struct xchk_pptr { + /* Cookie for retrieval of the pptr name. */ + xfblob_cookie name_cookie; + + /* Parent pointer record. */ + struct xfs_parent_rec pptr_rec; + + /* Length of the pptr name. */ + uint8_t namelen; +}; + +struct xchk_pptrs { + struct xfs_scrub *sc; + + /* How many parent pointers did we find at the end? */ + unsigned long long pptrs_found; + + /* Parent of this directory. */ + xfs_ino_t parent_ino; + + /* Fixed-size array of xchk_pptr structures. */ + struct xfarray *pptr_entries; + + /* Blobs containing parent pointer names. */ + struct xfblob *pptr_names; + + /* Scratch buffer for scanning pptr xattrs */ + struct xfs_da_args pptr_args; + + /* If we've cycled the ILOCK, we must revalidate all deferred pptrs. */ + bool need_revalidate; + + /* Name buffer */ + struct xfs_name xname; + char namebuf[MAXNAMELEN]; +}; + +/* Does this parent pointer match the dotdot entry? 
*/ +STATIC int +xchk_parent_scan_dotdot( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xchk_pptrs *pp = priv; + xfs_ino_t parent_ino; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, &parent_ino, NULL); + if (error) + return error; + + if (pp->parent_ino == parent_ino) + return -ECANCELED; + + return 0; +} + +/* Look up the dotdot entry so that we can check it as we walk the pptrs. */ +STATIC int +xchk_parent_pptr_and_dotdot( + struct xchk_pptrs *pp) +{ + struct xfs_scrub *sc = pp->sc; + int error; + + /* Look up '..' */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &pp->parent_ino); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + return error; + if (!xfs_verify_dir_ino(sc->mp, pp->parent_ino)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + + /* Is this the root dir? Then '..' must point to itself. */ + if (sc->ip == sc->mp->m_rootip) { + if (sc->ip->i_ino != pp->parent_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + + /* + * If this is now an unlinked directory, the dotdot value is + * meaningless as long as it points to a valid inode. + */ + if (VFS_I(sc->ip)->i_nlink == 0) + return 0; + + if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* Otherwise, walk the pptrs again, and check. */ + error = xchk_xattr_walk(sc, sc->ip, xchk_parent_scan_dotdot, NULL, pp); + if (error == -ECANCELED) { + /* Found a parent pointer that matches dotdot. */ + return 0; + } + if (!error || error == -EFSCORRUPTED) { + /* Found a broken parent pointer or no match. */ + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + return error; +} + +/* + * Try to lock a parent directory for checking dirents. 
Returns the inode + * flags for the locks we now hold, or zero if we failed. + */ +STATIC unsigned int +xchk_parent_lock_dir( + struct xfs_scrub *sc, + struct xfs_inode *dp) +{ + if (!xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED)) + return 0; + + if (!xfs_ilock_nowait(dp, XFS_ILOCK_SHARED)) { + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + return 0; + } + + if (!xfs_need_iread_extents(&dp->i_df)) + return XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED; + + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + if (!xfs_ilock_nowait(dp, XFS_ILOCK_EXCL)) { + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + return 0; + } + + return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL; +} + +/* Check the forward link (dirent) associated with this parent pointer. */ +STATIC int +xchk_parent_dirent( + struct xchk_pptrs *pp, + const struct xfs_name *xname, + struct xfs_inode *dp) +{ + struct xfs_scrub *sc = pp->sc; + xfs_ino_t child_ino; + int error; + + /* + * Use the name attached to this parent pointer to look up the + * directory entry in the alleged parent. + */ + error = xchk_dir_lookup(sc, dp, xname, &child_ino); + if (error == -ENOENT) { + xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error)) + return error; + + /* Does the inode number match? */ + if (child_ino != sc->ip->i_ino) { + xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + + return 0; +} + +/* Try to grab a parent directory. */ +STATIC int +xchk_parent_iget( + struct xchk_pptrs *pp, + const struct xfs_parent_rec *pptr, + struct xfs_inode **dpp) +{ + struct xfs_scrub *sc = pp->sc; + struct xfs_inode *ip; + xfs_ino_t parent_ino = be64_to_cpu(pptr->p_ino); + int error; + + /* Validate inode number. 
*/ + error = xfs_dir_ino_validate(sc->mp, parent_ino); + if (error) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return -ECANCELED; + } + + error = xchk_iget(sc, parent_ino, &ip); + if (error == -EINVAL || error == -ENOENT) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return -ECANCELED; + } + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error)) + return error; + + /* The parent must be a directory. */ + if (!S_ISDIR(VFS_I(ip)->i_mode)) { + xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0); + goto out_rele; + } + + /* Validate generation number. */ + if (VFS_I(ip)->i_generation != be32_to_cpu(pptr->p_gen)) { + xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0); + goto out_rele; + } + + *dpp = ip; + return 0; +out_rele: + xchk_irele(sc, ip); + return 0; +} + +/* + * Walk an xattr of a file. If this xattr is a parent pointer, follow it up + * to a parent directory and check that the parent has a dirent pointing back + * to us. + */ +STATIC int +xchk_parent_scan_attr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + }; + struct xchk_pptrs *pp = priv; + struct xfs_inode *dp = NULL; + const struct xfs_parent_rec *pptr_rec = value; + xfs_ino_t parent_ino; + unsigned int lockmode; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, &parent_ino, NULL); + if (error) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return error; + } + + /* No self-referential parent pointers. */ + if (parent_ino == sc->ip->i_ino) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return -ECANCELED; + } + + pp->pptrs_found++; + + error = xchk_parent_iget(pp, pptr_rec, &dp); + if (error) + return error; + if (!dp) + return 0; + + /* Try to lock the inode. 
*/ + lockmode = xchk_parent_lock_dir(sc, dp); + if (!lockmode) { + struct xchk_pptr save_pp = { + .pptr_rec = *pptr_rec, /* struct copy */ + .namelen = namelen, + }; + + /* Couldn't lock the inode, so save the pptr for later. */ + trace_xchk_parent_defer(sc->ip, &xname, dp->i_ino); + + error = xfblob_storename(pp->pptr_names, &save_pp.name_cookie, + &xname); + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, + &error)) + goto out_rele; + + error = xfarray_append(pp->pptr_entries, &save_pp); + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, + &error)) + goto out_rele; + + goto out_rele; + } + + error = xchk_parent_dirent(pp, &xname, dp); + if (error) + goto out_unlock; + +out_unlock: + xfs_iunlock(dp, lockmode); +out_rele: + xchk_irele(sc, dp); + return error; +} + +/* + * Revalidate a parent pointer that we collected in the past but couldn't check + * because of lock contention. Returns 0 if the parent pointer is still valid, + * -ENOENT if it has gone away on us, or a negative errno. + */ +STATIC int +xchk_parent_revalidate_pptr( + struct xchk_pptrs *pp, + const struct xfs_name *xname, + struct xfs_parent_rec *pptr) +{ + struct xfs_scrub *sc = pp->sc; + int error; + + error = xfs_parent_lookup(sc->tp, sc->ip, xname, pptr, &pp->pptr_args); + if (error == -ENOATTR) { + /* Parent pointer went away, nothing to revalidate. */ + return -ENOENT; + } + + return error; +} + +/* + * Check a parent pointer the slow way, which means we cycle locks a bunch + * and put up with revalidation until we get it done. + */ +STATIC int +xchk_parent_slow_pptr( + struct xchk_pptrs *pp, + const struct xfs_name *xname, + struct xfs_parent_rec *pptr) +{ + struct xfs_scrub *sc = pp->sc; + struct xfs_inode *dp = NULL; + unsigned int lockmode; + int error; + + /* Check that the deferred parent pointer still exists. 
*/ + if (pp->need_revalidate) { + error = xchk_parent_revalidate_pptr(pp, xname, pptr); + if (error == -ENOENT) + return 0; + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, + &error)) + return error; + } + + error = xchk_parent_iget(pp, pptr, &dp); + if (error) + return error; + if (!dp) + return 0; + + /* + * If we can grab both IOLOCK and ILOCK of the alleged parent, we + * can proceed with the validation. + */ + lockmode = xchk_parent_lock_dir(sc, dp); + if (lockmode) { + trace_xchk_parent_slowpath(sc->ip, xname, dp->i_ino); + goto check_dirent; + } + + /* + * We couldn't lock the parent dir. Drop all the locks and try to + * get them again, one at a time. + */ + xchk_iunlock(sc, sc->ilock_flags); + pp->need_revalidate = true; + + trace_xchk_parent_ultraslowpath(sc->ip, xname, dp->i_ino); + + error = xchk_dir_trylock_for_pptrs(sc, dp, &lockmode); + if (error) + goto out_rele; + + /* Revalidate the parent pointer now that we cycled locks. */ + error = xchk_parent_revalidate_pptr(pp, xname, pptr); + if (error == -ENOENT) { + error = 0; + goto out_unlock; + } + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error)) + goto out_unlock; + +check_dirent: + error = xchk_parent_dirent(pp, xname, dp); +out_unlock: + xfs_iunlock(dp, lockmode); +out_rele: + xchk_irele(sc, dp); + return error; +} + +/* Check all the parent pointers that we deferred the first time around. 
*/
+STATIC int
+xchk_parent_finish_slow_pptrs(
+	struct xchk_pptrs	*pp)
+{
+	xfarray_idx_t		array_cur;
+	int			error;
+
+	foreach_xfarray_idx(pp->pptr_entries, array_cur) {
+		struct xchk_pptr	pptr;
+
+		if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+			return 0;
+
+		error = xfarray_load(pp->pptr_entries, array_cur, &pptr);
+		if (error)
+			return error;
+
+		error = xfblob_loadname(pp->pptr_names, pptr.name_cookie,
+				&pp->xname, pptr.namelen);
+		if (error)
+			return error;
+
+		error = xchk_parent_slow_pptr(pp, &pp->xname, &pptr.pptr_rec);
+		if (error)
+			return error;
+	}
+
+	/* Empty out both xfiles now that we've checked everything. */
+	xfarray_truncate(pp->pptr_entries);
+	xfblob_truncate(pp->pptr_names);
+	return 0;
+}
+
+/* Count the number of parent pointers. */
+STATIC int
+xchk_parent_count_pptr(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*ip,
+	unsigned int		attr_flags,
+	const unsigned char	*name,
+	unsigned int		namelen,
+	const void		*value,
+	unsigned int		valuelen,
+	void			*priv)
+{
+	struct xchk_pptrs	*pp = priv;
+	int			error;
+
+	if (!(attr_flags & XFS_ATTR_PARENT))
+		return 0;
+
+	error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+			valuelen, NULL, NULL);
+	if (error)
+		return error;
+
+	pp->pptrs_found++;
+	return 0;
+}
+
+/*
+ * Compare the number of parent pointers to the link count. For
+ * non-directories these should be the same. For unlinked directories the
+ * count should be zero; for linked directories, it should be nonzero.
+ */
+STATIC int
+xchk_parent_count_pptrs(
+	struct xchk_pptrs	*pp)
+{
+	struct xfs_scrub	*sc = pp->sc;
+	int			error;
+
+	/*
+	 * If we cycled the ILOCK while cross-checking parent pointers with
+	 * dirents, then we need to recalculate the number of parent pointers.
+	 */
+	if (pp->need_revalidate) {
+		pp->pptrs_found = 0;
+		error = xchk_xattr_walk(sc, sc->ip, xchk_parent_count_pptr,
+				NULL, pp);
+		if (error == -EFSCORRUPTED) {
+			/* Found a bad parent pointer */
+			xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+			return 0;
+		}
+		if (error)
+			return error;
+	}
+
+	if (S_ISDIR(VFS_I(sc->ip)->i_mode)) {
+		if (sc->ip == sc->mp->m_rootip)
+			pp->pptrs_found++;
+
+		if (VFS_I(sc->ip)->i_nlink == 0 && pp->pptrs_found > 0)
+			xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+		else if (VFS_I(sc->ip)->i_nlink > 0 &&
+			 pp->pptrs_found == 0)
+			xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+	} else {
+		if (VFS_I(sc->ip)->i_nlink != pp->pptrs_found)
+			xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+	}
+
+	return 0;
+}
+
+/*
+ * Check parent pointers of a file.
+ *
+ * Allocates the scan context and two staging xfiles (an xfarray of deferred
+ * parent pointer records and an xfblob holding their names), walks the
+ * file's xattrs with xchk_parent_scan_attr, retries the parent pointers that
+ * were deferred, checks dotdot against the parent pointers for directories,
+ * and finally compares the parent pointer count with the link count.
+ *
+ * -ECANCELED from the xattr walk and -ETIMEDOUT from the deferred pass are
+ * converted to a zero return; any other nonzero error is passed back.
+ *
+ * Once both staging xfiles exist, every exit must leave through out_names
+ * so that they are destroyed before the context is freed.
+ */
+STATIC int
+xchk_parent_pptr(
+	struct xfs_scrub	*sc)
+{
+	struct xchk_pptrs	*pp;
+	char			*descr;
+	int			error;
+
+	pp = kvzalloc(sizeof(struct xchk_pptrs), XCHK_GFP_FLAGS);
+	if (!pp)
+		return -ENOMEM;
+	pp->sc = sc;
+	pp->xname.name = pp->namebuf;
+
+	/*
+	 * Set up some staging memory for parent pointers that we can't check
+	 * due to locking contention.
+	 */
+	descr = xchk_xfile_ino_descr(sc, "slow parent pointer entries");
+	error = xfarray_create(descr, 0, sizeof(struct xchk_pptr),
+			&pp->pptr_entries);
+	kfree(descr);
+	if (error)
+		goto out_pp;
+
+	descr = xchk_xfile_ino_descr(sc, "slow parent pointer names");
+	error = xfblob_create(descr, &pp->pptr_names);
+	kfree(descr);
+	if (error)
+		goto out_entries;
+
+	error = xchk_xattr_walk(sc, sc->ip, xchk_parent_scan_attr, NULL, pp);
+	if (error == -ECANCELED) {
+		error = 0;
+		goto out_names;
+	}
+	if (error)
+		goto out_names;
+
+	error = xchk_parent_finish_slow_pptrs(pp);
+	if (error == -ETIMEDOUT) {
+		/* Couldn't grab a lock, scrub was marked incomplete */
+		error = 0;
+		goto out_names;
+	}
+	if (error)
+		goto out_names;
+
+	if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		goto out_names;
+
+	/*
+	 * For subdirectories, make sure the dotdot entry references the same
+	 * inode as the parent pointers.
+	 *
+	 * If we're scanning a /consistent/ directory, there should only be
+	 * one parent pointer, and it should point to the same directory as
+	 * the dotdot entry.
+	 *
+	 * However, a corrupt directory tree might feature a subdirectory with
+	 * multiple parents. The directory loop scanner is responsible for
+	 * correcting that kind of problem, so for now we only validate that
+	 * the dotdot entry matches /one/ of the parents.
+	 */
+	if (S_ISDIR(VFS_I(sc->ip)->i_mode)) {
+		error = xchk_parent_pptr_and_dotdot(pp);
+		if (error)
+			goto out_names;
+	}
+
+	/*
+	 * Both staging xfiles are still live here, so leave via out_names;
+	 * jumping straight to out_pp would free pp without destroying them.
+	 */
+	if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		goto out_names;
+
+	/*
+	 * Complain if the number of parent pointers doesn't match the link
+	 * count. This could be a sign of missing parent pointers (or an
+	 * incorrect link count).
+	 */
+	error = xchk_parent_count_pptrs(pp);
+	if (error)
+		goto out_names;
+
+out_names:
+	xfblob_destroy(pp->pptr_names);
+out_entries:
+	xfarray_destroy(pp->pptr_entries);
+out_pp:
+	kvfree(pp);
+	return error;
+}
+
 /* Scrub a parent pointer.
*/ int xchk_parent( @@ -194,6 +828,9 @@ xchk_parent( xfs_ino_t parent_ino; int error = 0; + if (xfs_has_parent(mp)) + return xchk_parent_pptr(sc); + /* * If we're a directory, check that the '..' link points up to * a directory that has one entry pointing to us. @@ -237,3 +874,64 @@ xchk_parent( return error; } + +/* + * Decide if this file's extended attributes (and therefore its parent + * pointers) have been zapped to satisfy the inode and ifork verifiers. + * Checking and repairing should be postponed until the extended attribute + * structure is fixed. + */ +bool +xchk_pptr_looks_zapped( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + + ASSERT(xfs_has_parent(mp)); + + /* + * Temporary files that cannot be linked into the directory tree do not + * have attr forks because they cannot ever have parents. + */ + if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + return false; + + /* + * Directory tree roots do not have parents, so the expected outcome + * of a parent pointer scan is always the empty set. It's safe to scan + * them even if the attr fork was zapped. + */ + if (ip == mp->m_rootip) + return false; + + /* + * Metadata inodes are all rooted in the superblock and do not have + * any parents. Hence the attr fork will not be initialized, but + * there are no parent pointers that might have been zapped. + */ + if (xfs_is_metadata_inode(ip)) + return false; + + /* + * Linked and linkable non-rootdir files should always have an + * attribute fork because that is where parent pointers are + * stored. If the fork is absent, something is amiss. 
+ */ + if (!xfs_inode_has_attr_fork(ip)) + return true; + + /* Repair zapped this file's attr fork a short time ago */ + if (xfs_ifork_zapped(ip, XFS_ATTR_FORK)) + return true; + + /* + * If the dinode repair found a bad attr fork, it will reset the fork + * to extents format with zero records and wait for the bmapbta + * scrubber to reconstruct the block mappings. The extended attribute + * structure always contains some content when parent pointers are + * enabled, so this is a clear sign of a zapped attr fork. + */ + return ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_af.if_nextents == 0; +} diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c new file mode 100644 index 000000000000..7b42b7f65a0b --- /dev/null +++ b/fs/xfs/scrub/parent_repair.c @@ -0,0 +1,1612 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_priv.h" +#include "xfs_trans_space.h" +#include "xfs_health.h" +#include "xfs_exchmaps.h" +#include "xfs_parent.h" +#include "xfs_attr.h" +#include "xfs_bmap.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/iscan.h" +#include "scrub/findparent.h" +#include "scrub/readdir.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/orphanage.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include 
"scrub/attr_repair.h" +#include "scrub/listxattr.h" + +/* + * Repairing The Directory Parent Pointer + * ====================================== + * + * Currently, only directories support parent pointers (in the form of '..' + * entries), so we simply scan the filesystem and update the '..' entry. + * + * Note that because the only parent pointer is the dotdot entry, we won't + * touch an unhealthy directory, since the directory repair code is perfectly + * capable of rebuilding a directory with the proper parent inode. + * + * See the section on locking issues in dir_repair.c for more information about + * conflicts with the VFS. The findparent code will keep our incore parent + * inode up to date. + * + * If parent pointers are enabled, we instead reconstruct the parent pointer + * information by visiting every directory entry of every directory in the + * system and translating the relevant dirents into parent pointers. In this + * case, it is advantageous to stash all parent pointers created from dirents + * from a single parent file before replaying them into the temporary file. To + * save memory, the live filesystem scan reuses the findparent object. Parent + * pointer repair chooses either directory scanning or findparent, but not + * both. + * + * When salvaging completes, the remaining stashed entries are replayed to the + * temporary file. All non-parent pointer extended attributes are copied to + * the temporary file's extended attributes. An atomic file mapping exchange + * is used to commit the new xattr blocks to the file being repaired. This + * will disrupt attrmulti cursors. + */ + +/* Create a parent pointer in the tempfile. */ +#define XREP_PPTR_ADD (1) + +/* Remove a parent pointer from the tempfile. */ +#define XREP_PPTR_REMOVE (2) + +/* A stashed parent pointer update. */ +struct xrep_pptr { + /* Cookie for retrieval of the pptr name. */ + xfblob_cookie name_cookie; + + /* Parent pointer record. 
*/ + struct xfs_parent_rec pptr_rec; + + /* Length of the pptr name. */ + uint8_t namelen; + + /* XREP_PPTR_{ADD,REMOVE} */ + uint8_t action; +}; + +/* + * Stash up to 8 pages of recovered parent pointers in pptr_recs and + * pptr_names before we write them to the temp file. + */ +#define XREP_PARENT_MAX_STASH_BYTES (PAGE_SIZE * 8) + +struct xrep_parent { + struct xfs_scrub *sc; + + /* Fixed-size array of xrep_pptr structures. */ + struct xfarray *pptr_recs; + + /* Blobs containing parent pointer names. */ + struct xfblob *pptr_names; + + /* xattr keys */ + struct xfarray *xattr_records; + + /* xattr values */ + struct xfblob *xattr_blobs; + + /* Scratch buffers for saving extended attributes */ + unsigned char *xattr_name; + void *xattr_value; + unsigned int xattr_value_sz; + + /* + * Information used to exchange the attr fork mappings, if the fs + * supports parent pointers. + */ + struct xrep_tempexch tx; + + /* + * Information used to scan the filesystem to find the inumber of the + * dotdot entry for this directory. On filesystems without parent + * pointers, we use the findparent_* functions on this object and + * access only the parent_ino field directly. + * + * When parent pointers are enabled, the directory entry scanner uses + * the iscan, hooks, and lock fields of this object directly. + * @pscan.lock coordinates access to pptr_recs, pptr_names, pptr, and + * pptr_scratch. This reduces the memory requirements of this + * structure. + * + * The lock also controls access to xattr_records and xattr_blobs(?) + */ + struct xrep_parent_scan_info pscan; + + /* Orphanage reparenting request. */ + struct xrep_adoption adoption; + + /* Directory entry name, plus the trailing null. */ + struct xfs_name xname; + unsigned char namebuf[MAXNAMELEN]; + + /* Scratch buffer for scanning pptr xattrs */ + struct xfs_da_args pptr_args; + + /* Have we seen any live updates of parent pointers recently? 
*/ + bool saw_pptr_updates; + + /* Number of parents we found after all other repairs */ + unsigned long long parents; +}; + +struct xrep_parent_xattr { + /* Cookie for retrieval of the xattr name. */ + xfblob_cookie name_cookie; + + /* Cookie for retrieval of the xattr value. */ + xfblob_cookie value_cookie; + + /* XFS_ATTR_* flags */ + int flags; + + /* Length of the value and name. */ + uint32_t valuelen; + uint16_t namelen; +}; + +/* + * Stash up to 8 pages of attrs in xattr_records/xattr_blobs before we write + * them to the temp file. + */ +#define XREP_PARENT_XATTR_MAX_STASH_BYTES (PAGE_SIZE * 8) + +/* Tear down all the incore stuff we created. */ +static void +xrep_parent_teardown( + struct xrep_parent *rp) +{ + xrep_findparent_scan_teardown(&rp->pscan); + kvfree(rp->xattr_name); + rp->xattr_name = NULL; + kvfree(rp->xattr_value); + rp->xattr_value = NULL; + if (rp->xattr_blobs) + xfblob_destroy(rp->xattr_blobs); + rp->xattr_blobs = NULL; + if (rp->xattr_records) + xfarray_destroy(rp->xattr_records); + rp->xattr_records = NULL; + if (rp->pptr_names) + xfblob_destroy(rp->pptr_names); + rp->pptr_names = NULL; + if (rp->pptr_recs) + xfarray_destroy(rp->pptr_recs); + rp->pptr_recs = NULL; +} + +/* Set up for a parent repair. */ +int +xrep_setup_parent( + struct xfs_scrub *sc) +{ + struct xrep_parent *rp; + int error; + + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); + + rp = kvzalloc(sizeof(struct xrep_parent), XCHK_GFP_FLAGS); + if (!rp) + return -ENOMEM; + rp->sc = sc; + rp->xname.name = rp->namebuf; + sc->buf = rp; + + error = xrep_tempfile_create(sc, S_IFREG); + if (error) + return error; + + return xrep_orphanage_try_create(sc); +} + +/* + * Scan all files in the filesystem for a child dirent that we can turn into + * the dotdot entry for this directory. + */ +STATIC int +xrep_parent_find_dotdot( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + xfs_ino_t ino; + unsigned int sick, checked; + int error; + + /* + * Avoid sick directories. 
There shouldn't be anyone else clearing the + * directory's sick status. + */ + xfs_inode_measure_sickness(sc->ip, &sick, &checked); + if (sick & XFS_SICK_INO_DIR) + return -EFSCORRUPTED; + + ino = xrep_findparent_self_reference(sc); + if (ino != NULLFSINO) { + xrep_findparent_scan_finish_early(&rp->pscan, ino); + return 0; + } + + /* + * Drop the ILOCK on this directory so that we can scan for the dotdot + * entry. Figure out who is going to be the parent of this directory, + * then retake the ILOCK so that we can salvage directory entries. + */ + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* Does the VFS dcache have an answer for us? */ + ino = xrep_findparent_from_dcache(sc); + if (ino != NULLFSINO) { + error = xrep_findparent_confirm(sc, &ino); + if (!error && ino != NULLFSINO) { + xrep_findparent_scan_finish_early(&rp->pscan, ino); + goto out_relock; + } + } + + /* Scan the entire filesystem for a parent. */ + error = xrep_findparent_scan(&rp->pscan); +out_relock: + xchk_ilock(sc, XFS_ILOCK_EXCL); + + return error; +} + +/* + * Add this stashed incore parent pointer to the temporary file. + * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and + * must not be in transaction context. + */ +STATIC int +xrep_parent_replay_update( + struct xrep_parent *rp, + const struct xfs_name *xname, + struct xrep_pptr *pptr) +{ + struct xfs_scrub *sc = rp->sc; + + switch (pptr->action) { + case XREP_PPTR_ADD: + /* Create parent pointer. */ + trace_xrep_parent_replay_parentadd(sc->tempip, xname, + &pptr->pptr_rec); + + return xfs_parent_set(sc->tempip, sc->ip->i_ino, xname, + &pptr->pptr_rec, &rp->pptr_args); + case XREP_PPTR_REMOVE: + /* Remove parent pointer. */ + trace_xrep_parent_replay_parentremove(sc->tempip, xname, + &pptr->pptr_rec); + + return xfs_parent_unset(sc->tempip, sc->ip->i_ino, xname, + &pptr->pptr_rec, &rp->pptr_args); + } + + ASSERT(0); + return -EIO; +} + +/* + * Flush stashed parent pointer updates that have been recorded by the scanner. 
+ * This is done to reduce the memory requirements of the parent pointer + * rebuild, since files can have a lot of hardlinks and the fs can be busy. + * + * Caller must not hold transactions or ILOCKs. Caller must hold the tempfile + * IOLOCK. + */ +STATIC int +xrep_parent_replay_updates( + struct xrep_parent *rp) +{ + xfarray_idx_t array_cur; + int error; + + mutex_lock(&rp->pscan.lock); + foreach_xfarray_idx(rp->pptr_recs, array_cur) { + struct xrep_pptr pptr; + + error = xfarray_load(rp->pptr_recs, array_cur, &pptr); + if (error) + goto out_unlock; + + error = xfblob_loadname(rp->pptr_names, pptr.name_cookie, + &rp->xname, pptr.namelen); + if (error) + goto out_unlock; + rp->xname.len = pptr.namelen; + mutex_unlock(&rp->pscan.lock); + + error = xrep_parent_replay_update(rp, &rp->xname, &pptr); + if (error) + return error; + + mutex_lock(&rp->pscan.lock); + } + + /* Empty out both arrays now that we've added the entries. */ + xfarray_truncate(rp->pptr_recs); + xfblob_truncate(rp->pptr_names); + mutex_unlock(&rp->pscan.lock); + return 0; +out_unlock: + mutex_unlock(&rp->pscan.lock); + return error; +} + +/* + * Remember that we want to create a parent pointer in the tempfile. These + * stashed actions will be replayed later. + */ +STATIC int +xrep_parent_stash_parentadd( + struct xrep_parent *rp, + const struct xfs_name *name, + const struct xfs_inode *dp) +{ + struct xrep_pptr pptr = { + .action = XREP_PPTR_ADD, + .namelen = name->len, + }; + int error; + + trace_xrep_parent_stash_parentadd(rp->sc->tempip, dp, name); + + xfs_inode_to_parent_rec(&pptr.pptr_rec, dp); + error = xfblob_storename(rp->pptr_names, &pptr.name_cookie, name); + if (error) + return error; + + return xfarray_append(rp->pptr_recs, &pptr); +} + +/* + * Remember that we want to remove a parent pointer from the tempfile. These + * stashed actions will be replayed later. 
+ */ +STATIC int +xrep_parent_stash_parentremove( + struct xrep_parent *rp, + const struct xfs_name *name, + const struct xfs_inode *dp) +{ + struct xrep_pptr pptr = { + .action = XREP_PPTR_REMOVE, + .namelen = name->len, + }; + int error; + + trace_xrep_parent_stash_parentremove(rp->sc->tempip, dp, name); + + xfs_inode_to_parent_rec(&pptr.pptr_rec, dp); + error = xfblob_storename(rp->pptr_names, &pptr.name_cookie, name); + if (error) + return error; + + return xfarray_append(rp->pptr_recs, &pptr); +} + +/* + * Examine an entry of a directory. If this dirent leads us back to the file + * whose parent pointers we're rebuilding, add a pptr to the temporary + * directory. + */ +STATIC int +xrep_parent_scan_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xrep_parent *rp = priv; + int error; + + /* Dirent doesn't point to this directory. */ + if (ino != rp->sc->ip->i_ino) + return 0; + + /* No weird looking names. */ + if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) + return -EFSCORRUPTED; + + /* No mismatching ftypes. */ + if (name->type != xfs_mode_to_ftype(VFS_I(sc->ip)->i_mode)) + return -EFSCORRUPTED; + + /* Don't pick up dot or dotdot entries; we only want child dirents. */ + if (xfs_dir2_samename(name, &xfs_name_dotdot) || + xfs_dir2_samename(name, &xfs_name_dot)) + return 0; + + /* + * Transform this dirent into a parent pointer and queue it for later + * addition to the temporary file. + */ + mutex_lock(&rp->pscan.lock); + error = xrep_parent_stash_parentadd(rp, name, dp); + mutex_unlock(&rp->pscan.lock); + return error; +} + +/* + * Decide if we want to look for dirents in this directory. Skip the file + * being repaired and any files being used to stage repairs. 
+ */ +static inline bool +xrep_parent_want_scan( + struct xrep_parent *rp, + const struct xfs_inode *ip) +{ + return ip != rp->sc->ip && !xrep_is_tempfile(ip); +} + +/* + * Take ILOCK on a file that we want to scan. + * + * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt. + * Otherwise, take ILOCK_SHARED. + */ +static inline unsigned int +xrep_parent_scan_ilock( + struct xrep_parent *rp, + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + /* Still need to take the shared ILOCK to advance the iscan cursor. */ + if (!xrep_parent_want_scan(rp, ip)) + goto lock; + + if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) { + lock_mode = XFS_ILOCK_EXCL; + goto lock; + } + +lock: + xfs_ilock(ip, lock_mode); + return lock_mode; +} + +/* + * Scan this file for relevant child dirents that point to the file whose + * parent pointers we're rebuilding. + */ +STATIC int +xrep_parent_scan_file( + struct xrep_parent *rp, + struct xfs_inode *ip) +{ + unsigned int lock_mode; + int error = 0; + + lock_mode = xrep_parent_scan_ilock(rp, ip); + + if (!xrep_parent_want_scan(rp, ip)) + goto scan_done; + + if (S_ISDIR(VFS_I(ip)->i_mode)) { + /* + * If the directory looks as though it has been zapped by the + * inode record repair code, we cannot scan for child dirents. + */ + if (xchk_dir_looks_zapped(ip)) { + error = -EBUSY; + goto scan_done; + } + + error = xchk_dir_walk(rp->sc, ip, xrep_parent_scan_dirent, rp); + if (error) + goto scan_done; + } + +scan_done: + xchk_iscan_mark_visited(&rp->pscan.iscan, ip); + xfs_iunlock(ip, lock_mode); + return error; +} + +/* Decide if we've stashed too much pptr data in memory. 
*/ +static inline bool +xrep_parent_want_flush_stashed( + struct xrep_parent *rp) +{ + unsigned long long bytes; + + bytes = xfarray_bytes(rp->pptr_recs) + xfblob_bytes(rp->pptr_names); + return bytes > XREP_PARENT_MAX_STASH_BYTES; +} + +/* + * Scan all directories in the filesystem to look for dirents that we can turn + * into parent pointers. + */ +STATIC int +xrep_parent_scan_dirtree( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + struct xfs_inode *ip; + int error; + + /* + * Filesystem scans are time consuming. Drop the file ILOCK and all + * other resources for the duration of the scan and hope for the best. + * The live update hooks will keep our scan information up to date. + */ + xchk_trans_cancel(sc); + if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) + xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED | + XFS_ILOCK_EXCL)); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&rp->pscan.iscan, &ip)) == 1) { + bool flush; + + error = xrep_parent_scan_file(rp, ip); + xchk_irele(sc, ip); + if (error) + break; + + /* Flush stashed pptr updates to constrain memory usage. */ + mutex_lock(&rp->pscan.lock); + flush = xrep_parent_want_flush_stashed(rp); + mutex_unlock(&rp->pscan.lock); + if (flush) { + xchk_trans_cancel(sc); + + error = xrep_tempfile_iolock_polled(sc); + if (error) + break; + + error = xrep_parent_replay_updates(rp); + xrep_tempfile_iounlock(sc); + if (error) + break; + + error = xchk_trans_alloc_empty(sc); + if (error) + break; + } + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&rp->pscan.iscan); + if (error) { + /* + * If we couldn't grab an inode that was busy with a state + * change, change the error code so that we exit to userspace + * as quickly as possible. + */ + if (error == -EBUSY) + return -ECANCELED; + return error; + } + + /* + * Retake sc->ip's ILOCK now that we're done flushing stashed parent + * pointers. 
We end this function with an empty transaction and the + * ILOCK. + */ + xchk_ilock(rp->sc, XFS_ILOCK_EXCL); + return 0; +} + +/* + * Capture dirent updates being made by other threads which are relevant to the + * file being repaired. + */ +STATIC int +xrep_parent_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xrep_parent *rp; + struct xfs_scrub *sc; + int error; + + rp = container_of(nb, struct xrep_parent, pscan.dhook.dirent_hook.nb); + sc = rp->sc; + + /* + * This thread updated a dirent that points to the file that we're + * repairing, so stash the update for replay against the temporary + * file. + */ + if (p->ip->i_ino == sc->ip->i_ino && + xchk_iscan_want_live_update(&rp->pscan.iscan, p->dp->i_ino)) { + mutex_lock(&rp->pscan.lock); + if (p->delta > 0) + error = xrep_parent_stash_parentadd(rp, p->name, p->dp); + else + error = xrep_parent_stash_parentremove(rp, p->name, + p->dp); + if (!error) + rp->saw_pptr_updates = true; + mutex_unlock(&rp->pscan.lock); + if (error) + goto out_abort; + } + + return NOTIFY_DONE; +out_abort: + xchk_iscan_abort(&rp->pscan.iscan); + return NOTIFY_DONE; +} + +/* Reset a directory's dotdot entry, if needed. */ +STATIC int +xrep_parent_reset_dotdot( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + xfs_ino_t ino; + unsigned int spaceres; + int error = 0; + + ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); + + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &ino); + if (error || ino == rp->pscan.parent_ino) + return error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + trace_xrep_parent_reset_dotdot(sc->ip, rp->pscan.parent_ino); + + /* + * Reserve more space just in case we have to expand the dir. We're + * allowed to exceed quota to repair inconsistent metadata. 
+ */ + spaceres = xfs_rename_space_res(sc->mp, 0, false, xfs_name_dotdot.len, + false); + error = xfs_trans_reserve_more_inode(sc->tp, sc->ip, spaceres, 0, + true); + if (error) + return error; + + error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot, + rp->pscan.parent_ino, spaceres); + if (error) + return error; + + /* + * Roll transaction to detach the inode from the transaction but retain + * ILOCK_EXCL. + */ + return xfs_trans_roll(&sc->tp); +} + +/* Pass back the parent inumber if this is a parent pointer, returning -ECANCELED once one has been found. */ +STATIC int +xrep_parent_lookup_pptr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + xfs_ino_t *inop = priv; + xfs_ino_t parent_ino; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, &parent_ino, NULL); + if (error) + return error; + + *inop = parent_ino; + return -ECANCELED; +} + +/* + * Find the first parent of the scrub target by walking parent pointers for + * the purpose of deciding if we're going to move it to the orphanage. + * We don't care if the attr fork is zapped. + */ +STATIC int +xrep_parent_lookup_pptrs( + struct xfs_scrub *sc, + xfs_ino_t *inop) +{ + int error; + + *inop = NULLFSINO; + + error = xchk_xattr_walk(sc, sc->ip, xrep_parent_lookup_pptr, NULL, + inop); + if (error && error != -ECANCELED) + return error; + return 0; +} + +/* + * Move the current file to the orphanage. + * + * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon + * successful return, the scrub transaction will have enough extra reservation + * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the + * orphanage; and both inodes will be ijoined. 
+ */ +STATIC int +xrep_parent_move_to_orphanage( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + xfs_ino_t orig_parent, new_parent; + int error; + + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) { + /* + * We are about to drop the ILOCK on sc->ip to lock the + * orphanage and prepare for the adoption. Therefore, look up + * the old dotdot entry for sc->ip so that we can compare it + * after we re-lock sc->ip. + */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, + &orig_parent); + if (error) + return error; + } else { + /* + * We haven't dropped the ILOCK since we committed the new + * xattr structure (and hence the new parent pointer records), + * which means that the file cannot have been moved in the + * directory tree, and there are no parents. + */ + orig_parent = NULLFSINO; + } + + /* + * Drop the ILOCK on the scrub target and commit the transaction. + * Adoption computes its own resource requirements and gathers the + * necessary components. + */ + error = xrep_trans_commit(sc); + if (error) + return error; + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* If we can take the orphanage's iolock then we're ready to move. */ + if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) { + xchk_iunlock(sc, sc->ilock_flags); + error = xrep_orphanage_iolock_two(sc); + if (error) + return error; + } + + /* Grab transaction and ILOCK the two files. */ + error = xrep_adoption_trans_alloc(sc, &rp->adoption); + if (error) + return error; + + error = xrep_adoption_compute_name(&rp->adoption, &rp->xname); + if (error) + return error; + + /* + * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot + * entry again. If the parent changed or the child was unlinked while + * the child directory was unlocked, we don't need to move the child to + * the orphanage after all. For a non-directory, we have to scan for + * the first parent pointer to see if one has been added. 
+ */ + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, + &new_parent); + else + error = xrep_parent_lookup_pptrs(sc, &new_parent); + if (error) + return error; + + /* + * Attach to the orphanage if we still have a linked directory and it + * hasn't been moved. + */ + if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) { + error = xrep_adoption_move(&rp->adoption); + if (error) + return error; + } + + /* + * Launder the scrub transaction so we can drop the orphanage ILOCK + * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK. + */ + error = xrep_adoption_trans_roll(&rp->adoption); + if (error) + return error; + + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + return 0; +} + +/* Ensure that the xattr value buffer is large enough. */ +STATIC int +xrep_parent_alloc_xattr_value( + struct xrep_parent *rp, + size_t bufsize) +{ + void *new_val; + + if (rp->xattr_value_sz >= bufsize) + return 0; + + if (rp->xattr_value) { + kvfree(rp->xattr_value); + rp->xattr_value = NULL; + rp->xattr_value_sz = 0; + } + + new_val = kvmalloc(bufsize, XCHK_GFP_FLAGS); + if (!new_val) + return -ENOMEM; + + rp->xattr_value = new_val; + rp->xattr_value_sz = bufsize; + return 0; +} + +/* Retrieve the (remote) value of a non-pptr xattr. */ +STATIC int +xrep_parent_fetch_xattr_remote( + struct xrep_parent *rp, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + unsigned int valuelen) +{ + struct xfs_scrub *sc = rp->sc; + struct xfs_da_args args = { + .attr_filter = attr_flags & XFS_ATTR_NSP_ONDISK_MASK, + .geo = sc->mp->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .dp = ip, + .name = name, + .namelen = namelen, + .trans = sc->tp, + .valuelen = valuelen, + .owner = ip->i_ino, + }; + int error; + + /* + * If we need a larger value buffer, try to allocate one. If that + * fails, return with -EDEADLOCK to try harder. 
+ */ + error = xrep_parent_alloc_xattr_value(rp, valuelen); + if (error == -ENOMEM) + return -EDEADLOCK; + if (error) + return error; + + args.value = rp->xattr_value; + xfs_attr_sethash(&args); + return xfs_attr_get_ilocked(&args); +} + +/* Stash non-pptr attributes for later replay into the temporary file. */ +STATIC int +xrep_parent_stash_xattr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xrep_parent_xattr key = { + .valuelen = valuelen, + .namelen = namelen, + .flags = attr_flags & XFS_ATTR_NSP_ONDISK_MASK, + }; + struct xrep_parent *rp = priv; + int error; + + if (attr_flags & (XFS_ATTR_INCOMPLETE | XFS_ATTR_PARENT)) + return 0; + + if (!value) { + error = xrep_parent_fetch_xattr_remote(rp, ip, attr_flags, + name, namelen, valuelen); + if (error) + return error; + + value = rp->xattr_value; + } + + trace_xrep_parent_stash_xattr(rp->sc->tempip, key.flags, (void *)name, + key.namelen, key.valuelen); + + error = xfblob_store(rp->xattr_blobs, &key.name_cookie, name, + key.namelen); + if (error) + return error; + + error = xfblob_store(rp->xattr_blobs, &key.value_cookie, value, + key.valuelen); + if (error) + return error; + + return xfarray_append(rp->xattr_records, &key); +} + +/* Insert one xattr key/value. */ +STATIC int +xrep_parent_insert_xattr( + struct xrep_parent *rp, + const struct xrep_parent_xattr *key) +{ + struct xfs_da_args args = { + .dp = rp->sc->tempip, + .attr_filter = key->flags, + .namelen = key->namelen, + .valuelen = key->valuelen, + .owner = rp->sc->ip->i_ino, + .geo = rp->sc->mp->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .op_flags = XFS_DA_OP_OKNOENT, + }; + int error; + + ASSERT(!(key->flags & XFS_ATTR_PARENT)); + + /* + * Grab pointers to the scrub buffer so that we can use them to insert + * attrs into the temp file. 
+ */ + args.name = rp->xattr_name; + args.value = rp->xattr_value; + + /* + * The attribute name is stored near the end of the in-core buffer, + * though we reserve one more byte to ensure null termination. + */ + rp->xattr_name[XATTR_NAME_MAX] = 0; + + error = xfblob_load(rp->xattr_blobs, key->name_cookie, rp->xattr_name, + key->namelen); + if (error) + return error; + + error = xfblob_free(rp->xattr_blobs, key->name_cookie); + if (error) + return error; + + error = xfblob_load(rp->xattr_blobs, key->value_cookie, args.value, + key->valuelen); + if (error) + return error; + + error = xfblob_free(rp->xattr_blobs, key->value_cookie); + if (error) + return error; + + rp->xattr_name[key->namelen] = 0; + + trace_xrep_parent_insert_xattr(rp->sc->tempip, key->flags, + rp->xattr_name, key->namelen, key->valuelen); + + xfs_attr_sethash(&args); + return xfs_attr_set(&args, XFS_ATTRUPDATE_UPSERT, false); +} + +/* + * Periodically flush salvaged attributes to the temporary file. This is done + * to reduce the memory requirements of the xattr rebuild because files can + * contain millions of attributes. + */ +STATIC int +xrep_parent_flush_xattrs( + struct xrep_parent *rp) +{ + xfarray_idx_t array_cur; + int error; + + /* + * Entering this function, the scrub context has a reference to the + * inode being repaired, the temporary file, and the empty scrub + * transaction that we created for the xattr scan. We hold ILOCK_EXCL + * on the inode being repaired. + * + * To constrain kernel memory use, we occasionally flush salvaged + * xattrs from the xfarray and xfblob structures into the temporary + * file in preparation for exchanging the xattr structures at the end. + * Updating the temporary file requires a transaction, so we commit the + * scrub transaction and drop the ILOCK so that xfs_attr_set can + * allocate whatever transaction it wants. 
+ * + * We still hold IOLOCK_EXCL on the inode being repaired, which + * prevents anyone from adding xattrs (or parent pointers) while we're + * flushing. + */ + xchk_trans_cancel(rp->sc); + xchk_iunlock(rp->sc, XFS_ILOCK_EXCL); + + /* + * Take the IOLOCK of the temporary file while we modify xattrs. This + * isn't strictly required because the temporary file is never revealed + * to userspace, but we follow the same locking rules. We still hold + * sc->ip's IOLOCK. + */ + error = xrep_tempfile_iolock_polled(rp->sc); + if (error) + return error; + + /* Add all the salvaged attrs to the temporary file. */ + foreach_xfarray_idx(rp->xattr_records, array_cur) { + struct xrep_parent_xattr key; + + error = xfarray_load(rp->xattr_records, array_cur, &key); + if (error) + return error; + + error = xrep_parent_insert_xattr(rp, &key); + if (error) + return error; + } + + /* Empty out both arrays now that we've added the entries. */ + xfarray_truncate(rp->xattr_records); + xfblob_truncate(rp->xattr_blobs); + + xrep_tempfile_iounlock(rp->sc); + + /* Recreate the empty transaction and relock the inode. */ + error = xchk_trans_alloc_empty(rp->sc); + if (error) + return error; + xchk_ilock(rp->sc, XFS_ILOCK_EXCL); + return 0; +} + +/* Decide if we've stashed too much xattr data in memory. */ +static inline bool +xrep_parent_want_flush_xattrs( + struct xrep_parent *rp) +{ + unsigned long long bytes; + + bytes = xfarray_bytes(rp->xattr_records) + + xfblob_bytes(rp->xattr_blobs); + return bytes > XREP_PARENT_XATTR_MAX_STASH_BYTES; +} + +/* Flush staged attributes to the temporary file if we're over the limit. 
*/ +STATIC int +xrep_parent_try_flush_xattrs( + struct xfs_scrub *sc, + void *priv) +{ + struct xrep_parent *rp = priv; + int error; + + if (!xrep_parent_want_flush_xattrs(rp)) + return 0; + + error = xrep_parent_flush_xattrs(rp); + if (error) + return error; + + /* + * If there were any parent pointer updates to the xattr structure + * while we dropped the ILOCK, the xattr structure is now stale. + * Signal to the attr copy process that we need to start over, but + * this time without opportunistic attr flushing. + * + * This is unlikely to happen, so we're ok with restarting the copy. + */ + mutex_lock(&rp->pscan.lock); + if (rp->saw_pptr_updates) + error = -ESTALE; + mutex_unlock(&rp->pscan.lock); + return error; +} + +/* Copy all the non-pptr extended attributes into the temporary file. */ +STATIC int +xrep_parent_copy_xattrs( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + int error; + + /* + * Clear the pptr updates flag. We hold sc->ip ILOCKed, so there + * can't be any parent pointer updates in progress. + */ + mutex_lock(&rp->pscan.lock); + rp->saw_pptr_updates = false; + mutex_unlock(&rp->pscan.lock); + + /* Copy xattrs, stopping periodically to flush the incore buffers. */ + error = xchk_xattr_walk(sc, sc->ip, xrep_parent_stash_xattr, + xrep_parent_try_flush_xattrs, rp); + if (error && error != -ESTALE) + return error; + + if (error == -ESTALE) { + /* + * The xattr copy collided with a parent pointer update. + * Restart the copy, but this time hold the ILOCK all the way + * to the end to lock out any directory parent pointer updates. + */ + error = xchk_xattr_walk(sc, sc->ip, xrep_parent_stash_xattr, + NULL, rp); + if (error) + return error; + } + + /* Flush any remaining stashed xattrs to the temporary file. 
*/ + if (xfarray_bytes(rp->xattr_records) == 0) + return 0; + + return xrep_parent_flush_xattrs(rp); +} + +/* + * Ensure that @sc->ip and @sc->tempip both have attribute forks before we head + * into the attr fork exchange transaction. All files on a filesystem with + * parent pointers must have an attr fork because the parent pointer code does + * not itself add attribute forks. + * + * Note: Unlinkable unlinked files don't need one, but the overhead of having + * an unnecessary attr fork is not justified by the additional code complexity + * that would be needed to track that state correctly. + */ +STATIC int +xrep_parent_ensure_attr_fork( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + int error; + + error = xfs_attr_add_fork(sc->tempip, + sizeof(struct xfs_attr_sf_hdr), 1); + if (error) + return error; + return xfs_attr_add_fork(sc->ip, sizeof(struct xfs_attr_sf_hdr), 1); +} + +/* + * Finish replaying stashed parent pointer updates, allocate a transaction for + * exchanging extent mappings, and take the ILOCKs of both files before we + * commit the new attribute structure. + */ +STATIC int +xrep_parent_finalize_tempfile( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + int error; + + /* + * Repair relies on the ILOCK to quiesce all possible xattr updates. + * Replay all queued parent pointer updates into the tempfile before + * exchanging the contents, even if that means dropping the ILOCKs and + * the transaction. 
+ */ + do { + error = xrep_parent_replay_updates(rp); + if (error) + return error; + + error = xrep_parent_ensure_attr_fork(rp); + if (error) + return error; + + error = xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rp->tx); + if (error) + return error; + + if (xfarray_length(rp->pptr_recs) == 0) + break; + + xchk_trans_cancel(sc); + xrep_tempfile_iunlock_both(sc); + } while (!xchk_should_terminate(sc, &error)); + return error; +} + +/* + * Replay all the stashed parent pointers into the temporary file, copy all + * the non-pptr xattrs from the file being repaired into the temporary file, + * and exchange the attr fork contents atomically. + */ +STATIC int +xrep_parent_rebuild_pptrs( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + xfs_ino_t parent_ino = NULLFSINO; + int error; + + /* + * Copy non-pptr xattrs from the file being repaired into the + * temporary file's xattr structure. We hold sc->ip's IOLOCK, which + * prevents setxattr/removexattr calls from occurring, but renames + * update the parent pointers without holding IOLOCK. If we detect + * stale attr structures, we restart the scan but only flush at the + * end. + */ + error = xrep_parent_copy_xattrs(rp); + if (error) + return error; + + /* + * Cancel the empty transaction that we used to walk and copy attrs, + * and drop the ILOCK so that we can take the IOLOCK on the temporary + * file. We still hold sc->ip's IOLOCK. + */ + xchk_trans_cancel(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + error = xrep_tempfile_iolock_polled(sc); + if (error) + return error; + + /* + * Allocate transaction, lock inodes, and make sure that we've replayed + * all the stashed pptr updates to the tempfile. After this point, + * we're ready to exchange the attr fork mappings. + */ + error = xrep_parent_finalize_tempfile(rp); + if (error) + return error; + + /* Last chance to abort before we start committing pptr fixes. 
*/ + if (xchk_should_terminate(sc, &error)) + return error; + + if (xchk_iscan_aborted(&rp->pscan.iscan)) + return -ECANCELED; + + /* + * Exchange the attr fork contents and junk the old attr fork contents, + * which are now in the tempfile. + */ + error = xrep_xattr_swap(sc, &rp->tx); + if (error) + return error; + error = xrep_xattr_reset_tempfile_fork(sc); + if (error) + return error; + + /* + * Roll to get a transaction without any inodes joined to it. Then we + * can drop the tempfile's ILOCK and IOLOCK before doing more work on + * the scrub target file. + */ + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + xrep_tempfile_iunlock(sc); + xrep_tempfile_iounlock(sc); + + /* + * We've committed the new parent pointers. Find at least one parent + * so that we can decide if we're moving this file to the orphanage. + * For this purpose, root directories are their own parents. + */ + if (sc->ip == sc->mp->m_rootip) { + xrep_findparent_scan_found(&rp->pscan, sc->ip->i_ino); + } else { + error = xrep_parent_lookup_pptrs(sc, &parent_ino); + if (error) + return error; + if (parent_ino != NULLFSINO) + xrep_findparent_scan_found(&rp->pscan, parent_ino); + } + return 0; +} + +/* + * Commit the new parent pointer structure (currently only the dotdot entry) to + * the file that we're repairing. + */ +STATIC int +xrep_parent_rebuild_tree( + struct xrep_parent *rp) +{ + int error; + + if (xfs_has_parent(rp->sc->mp)) { + error = xrep_parent_rebuild_pptrs(rp); + if (error) + return error; + } + + if (rp->pscan.parent_ino == NULLFSINO) { + if (xrep_orphanage_can_adopt(rp->sc)) + return xrep_parent_move_to_orphanage(rp); + return -EFSCORRUPTED; + } + + if (S_ISDIR(VFS_I(rp->sc->ip)->i_mode)) + return xrep_parent_reset_dotdot(rp); + + return 0; +} + +/* Count the number of parent pointers. 
*/ +STATIC int +xrep_parent_count_pptr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xrep_parent *rp = priv; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, NULL, NULL); + if (error) + return error; + + rp->parents++; + return 0; +} + +/* + * After all parent pointer rebuilding and adoption activity completes, reset + * the link count of this nondirectory, having scanned the fs to rebuild all + * parent pointers. + */ +STATIC int +xrep_parent_set_nondir_nlink( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + struct xfs_inode *ip = sc->ip; + struct xfs_perag *pag; + bool joined = false; + int error; + + /* Count parent pointers so we can reset the file link count. */ + rp->parents = 0; + error = xchk_xattr_walk(sc, ip, xrep_parent_count_pptr, NULL, rp); + if (error) + return error; + + if (rp->parents > 0 && xfs_inode_on_unlinked_list(ip)) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + + /* + * The file is on the unlinked list but we found parents. + * Remove the file from the unlinked list. + */ + pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, ip->i_ino)); + if (!pag) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xfs_iunlink_remove(sc->tp, pag, ip); + xfs_perag_put(pag); + if (error) + return error; + } else if (rp->parents == 0 && !xfs_inode_on_unlinked_list(ip)) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + + /* + * The file is not on the unlinked list but we found no + * parents. Add the file to the unlinked list. + */ + error = xfs_iunlink(sc->tp, ip); + if (error) + return error; + } + + /* Set the correct link count. 
*/ + if (VFS_I(ip)->i_nlink != rp->parents) { + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + set_nlink(VFS_I(ip), min_t(unsigned long long, rp->parents, + XFS_NLINK_PINNED)); + } + + /* Log the inode to keep it moving forward if we dirtied anything. */ + if (joined) + xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE); + return 0; +} + +/* Set up the filesystem scan so we can look for parents. */ +STATIC int +xrep_parent_setup_scan( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + char *descr; + struct xfs_da_geometry *geo = sc->mp->m_attr_geo; + int max_len; + int error; + + if (!xfs_has_parent(sc->mp)) + return xrep_findparent_scan_start(sc, &rp->pscan); + + /* Buffers for copying non-pptr attrs to the tempfile */ + rp->xattr_name = kvmalloc(XATTR_NAME_MAX + 1, XCHK_GFP_FLAGS); + if (!rp->xattr_name) + return -ENOMEM; + + /* + * Allocate enough memory to handle loading local attr values from the + * xfblob data while flushing stashed attrs to the temporary file. + * We only realloc the buffer when salvaging remote attr values, so + * TRY_HARDER means we allocate the maximal attr value size. + */ + if (sc->flags & XCHK_TRY_HARDER) + max_len = XATTR_SIZE_MAX; + else + max_len = xfs_attr_leaf_entsize_local_max(geo->blksize); + error = xrep_parent_alloc_xattr_value(rp, max_len); + if (error) + goto out_xattr_name; + + /* Set up some staging memory for logging parent pointer updates. 
*/ + descr = xchk_xfile_ino_descr(sc, "parent pointer entries"); + error = xfarray_create(descr, 0, sizeof(struct xrep_pptr), + &rp->pptr_recs); + kfree(descr); + if (error) + goto out_xattr_value; + + descr = xchk_xfile_ino_descr(sc, "parent pointer names"); + error = xfblob_create(descr, &rp->pptr_names); + kfree(descr); + if (error) + goto out_recs; + + /* Set up some storage for copying attrs before the mapping exchange */ + descr = xchk_xfile_ino_descr(sc, + "parent pointer retained xattr entries"); + error = xfarray_create(descr, 0, sizeof(struct xrep_parent_xattr), + &rp->xattr_records); + kfree(descr); + if (error) + goto out_names; + + descr = xchk_xfile_ino_descr(sc, + "parent pointer retained xattr values"); + error = xfblob_create(descr, &rp->xattr_blobs); + kfree(descr); + if (error) + goto out_attr_keys; + + error = __xrep_findparent_scan_start(sc, &rp->pscan, + xrep_parent_live_update); + if (error) + goto out_attr_values; + + return 0; + +out_attr_values: + xfblob_destroy(rp->xattr_blobs); + rp->xattr_blobs = NULL; +out_attr_keys: + xfarray_destroy(rp->xattr_records); + rp->xattr_records = NULL; +out_names: + xfblob_destroy(rp->pptr_names); + rp->pptr_names = NULL; +out_recs: + xfarray_destroy(rp->pptr_recs); + rp->pptr_recs = NULL; +out_xattr_value: + kvfree(rp->xattr_value); + rp->xattr_value = NULL; +out_xattr_name: + kvfree(rp->xattr_name); + rp->xattr_name = NULL; + return error; +} + +int +xrep_parent( + struct xfs_scrub *sc) +{ + struct xrep_parent *rp = sc->buf; + int error; + + /* + * When the parent pointers feature is enabled, repairs are committed + * by atomically committing a new xattr structure and reaping the old + * attr fork. Reaping requires rmap and exchange-range to be enabled. 
+ */ + if (xfs_has_parent(sc->mp)) { + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + if (!xfs_has_exchange_range(sc->mp)) + return -EOPNOTSUPP; + } + + error = xrep_parent_setup_scan(rp); + if (error) + return error; + + if (xfs_has_parent(sc->mp)) + error = xrep_parent_scan_dirtree(rp); + else + error = xrep_parent_find_dotdot(rp); + if (error) + goto out_teardown; + + /* Last chance to abort before we start committing dotdot fixes. */ + if (xchk_should_terminate(sc, &error)) + goto out_teardown; + + error = xrep_parent_rebuild_tree(rp); + if (error) + goto out_teardown; + if (xfs_has_parent(sc->mp) && !S_ISDIR(VFS_I(sc->ip)->i_mode)) { + error = xrep_parent_set_nondir_nlink(rp); + if (error) + goto out_teardown; + } + + error = xrep_defer_finish(sc); + +out_teardown: + xrep_parent_teardown(rp); + return error; +} diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c index 0bab4c30cb85..90cd1512bba9 100644 --- a/fs/xfs/scrub/quota_repair.c +++ b/fs/xfs/scrub/quota_repair.c @@ -77,8 +77,6 @@ xrep_quota_item_fill_bmap_hole( irec, &nmaps); if (error) return error; - if (nmaps != 1) - return -ENOSPC; dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec->br_startblock); @@ -444,10 +442,6 @@ xrep_quota_data_fork( XFS_BMAPI_CONVERT, 0, &nrec, &nmap); if (error) goto out; - if (nmap != 1) { - error = -ENOSPC; - goto out; - } ASSERT(nrec.br_startoff == irec.br_startoff); ASSERT(nrec.br_blockcount == irec.br_blockcount); diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c index dfdcb96b6c16..01c9a2dc0f2c 100644 --- a/fs/xfs/scrub/readdir.c +++ b/fs/xfs/scrub/readdir.c @@ -18,6 +18,7 @@ #include "xfs_trans.h" #include "xfs_error.h" #include "scrub/scrub.h" +#include "scrub/common.h" #include "scrub/readdir.h" /* Call a function for every entry in a shortform directory. 
*/ @@ -99,7 +100,7 @@ xchk_dir_walk_block( unsigned int off, next_off, end; int error; - error = xfs_dir3_block_read(sc->tp, dp, &bp); + error = xfs_dir3_block_read(sc->tp, dp, dp->i_ino, &bp); if (error) return error; @@ -175,7 +176,7 @@ xchk_read_leaf_dir_buf( if (new_off > *curoff) *curoff = new_off; - return xfs_dir3_data_read(tp, dp, map.br_startoff, 0, bpp); + return xfs_dir3_data_read(tp, dp, dp->i_ino, map.br_startoff, 0, bpp); } /* Call a function for every entry in a leaf directory. */ @@ -273,8 +274,8 @@ xchk_dir_walk( .dp = dp, .geo = dp->i_mount->m_dir_geo, .trans = sc->tp, + .owner = dp->i_ino, }; - bool isblock; int error; if (xfs_is_shutdown(dp->i_mount)) @@ -283,22 +284,17 @@ xchk_dir_walk( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) + switch (xfs_dir2_format(&args, &error)) { + case XFS_DIR2_FMT_SF: return xchk_dir_walk_sf(sc, dp, dirent_fn, priv); - - /* dir2 functions require that the data fork is loaded */ - error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK); - if (error) - return error; - - error = xfs_dir2_isblock(&args, &isblock); - if (error) - return error; - - if (isblock) + case XFS_DIR2_FMT_BLOCK: return xchk_dir_walk_block(sc, dp, dirent_fn, priv); - - return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv); + case XFS_DIR2_FMT_LEAF: + case XFS_DIR2_FMT_NODE: + return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv); + default: + return error; + } } /* @@ -324,50 +320,102 @@ xchk_dir_lookup( .hashval = xfs_dir2_hashname(dp->i_mount, name), .whichfork = XFS_DATA_FORK, .op_flags = XFS_DA_OP_OKNOENT, + .owner = dp->i_ino, }; - bool isblock, isleaf; int error; if (xfs_is_shutdown(dp->i_mount)) return -EIO; + /* + * A temporary directory's block headers are written with the owner + * set to sc->ip, so we must switch the owner here for the lookup. 
+ */ + if (dp == sc->tempip) + args.owner = sc->ip->i_ino; + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - error = xfs_dir2_sf_lookup(&args); - goto out_check_rval; - } + error = xfs_dir_lookup_args(&args); + if (!error) + *ino = args.inumber; + return error; +} - /* dir2 functions require that the data fork is loaded */ - error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK); - if (error) - return error; +/* + * Try to grab the IOLOCK and ILOCK of sc->ip and ip, returning @ip's lock + * state. The caller may have a transaction, so we must use trylock for both + * IOLOCKs. + */ +static inline unsigned int +xchk_dir_trylock_both( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (!xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + return 0; - error = xfs_dir2_isblock(&args, &isblock); - if (error) - return error; + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) + goto parent_iolock; - if (isblock) { - error = xfs_dir2_block_lookup(&args); - goto out_check_rval; - } + xchk_ilock(sc, XFS_ILOCK_EXCL); + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + goto parent_ilock; - error = xfs_dir2_isleaf(&args, &isleaf); - if (error) - return error; + return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL; + +parent_ilock: + xchk_iunlock(sc, XFS_ILOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); +parent_iolock: + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + return 0; +} + +/* + * Try for a limited time to grab the IOLOCK and ILOCK of both the scrub target + * (@sc->ip) and the inode at the other end (@ip) of a directory or parent + * pointer link so that we can check that link. + * + * We do not know ahead of time that the directory tree is /not/ corrupt, so we + * cannot use the "lock two inode" functions because we do not know that there + * is not a racing thread trying to take the locks in opposite order. 
First + * take IOLOCK_EXCL of the scrub target, and then try to take IOLOCK_SHARED + * of @ip to synchronize with the VFS. Next, take ILOCK_EXCL of the scrub + * target and @ip to synchronize with XFS. + * + * If the trylocks succeed, *lockmode will be set to the locks held for @ip; + * @sc->ilock_flags will be set for the locks held for @sc->ip; and zero will + * be returned. If not, returns -EDEADLOCK to try again; or -ETIMEDOUT if + * XCHK_TRY_HARDER was set. Returns -EINTR if the process has been killed. + */ +int +xchk_dir_trylock_for_pptrs( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int *lockmode) +{ + unsigned int nr; + int error = 0; + + ASSERT(sc->ilock_flags == 0); + + for (nr = 0; nr < HZ; nr++) { + *lockmode = xchk_dir_trylock_both(sc, ip); + if (*lockmode) + return 0; - if (isleaf) { - error = xfs_dir2_leaf_lookup(&args); - goto out_check_rval; + if (xchk_should_terminate(sc, &error)) + return error; + + delay(1); } - error = xfs_dir2_node_lookup(&args); + if (sc->flags & XCHK_TRY_HARDER) { + xchk_set_incomplete(sc); + return -ETIMEDOUT; + } -out_check_rval: - if (error == -EEXIST) - error = 0; - if (!error) - *ino = args.inumber; - return error; + return -EDEADLOCK; } diff --git a/fs/xfs/scrub/readdir.h b/fs/xfs/scrub/readdir.h index 55787f4df123..da501877a64d 100644 --- a/fs/xfs/scrub/readdir.h +++ b/fs/xfs/scrub/readdir.h @@ -16,4 +16,7 @@ int xchk_dir_walk(struct xfs_scrub *sc, struct xfs_inode *dp, int xchk_dir_lookup(struct xfs_scrub *sc, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t *ino); +int xchk_dir_trylock_for_pptrs(struct xfs_scrub *sc, struct xfs_inode *ip, + unsigned int *lockmode); + #endif /* __XFS_SCRUB_READDIR_H__ */ diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 0252a3b5b65a..be283153c254 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -211,6 +211,48 @@ static inline void xreap_defer_finish_reset(struct xreap_state *rs) rs->force_roll = false; } +/* + * Compute the maximum 
length of a buffer cache scan (in units of sectors), + * given a quantity of fs blocks. + */ +xfs_daddr_t +xrep_bufscan_max_sectors( + struct xfs_mount *mp, + xfs_extlen_t fsblocks) +{ + int max_fsbs; + + /* Remote xattr values are the largest buffers that we support. */ + max_fsbs = xfs_attr3_max_rmt_blocks(mp); + + return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs)); +} + +/* + * Return an incore buffer from a sector scan, or NULL if there are no buffers + * left to return. + */ +struct xfs_buf * +xrep_bufscan_advance( + struct xfs_mount *mp, + struct xrep_bufscan *scan) +{ + scan->__sector_count += scan->daddr_step; + while (scan->__sector_count <= scan->max_sectors) { + struct xfs_buf *bp = NULL; + int error; + + error = xfs_buf_incore(mp->m_ddev_targp, scan->daddr, + scan->__sector_count, XBF_LIVESCAN, &bp); + if (!error) + return bp; + + scan->__sector_count += scan->daddr_step; + } + + return NULL; +} + /* Try to invalidate the incore buffers for an extent that we're freeing. */ STATIC void xreap_agextent_binval( @@ -241,28 +283,15 @@ xreap_agextent_binval( * of any plausible size. */ while (bno < agbno_next) { - xfs_agblock_t fsbcount; - xfs_agblock_t max_fsbs; - - /* - * Max buffer size is the max remote xattr buffer size, which - * is one fs block larger than 64k. 
- */ - max_fsbs = min_t(xfs_agblock_t, agbno_next - bno, - xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX)); - - for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) { - struct xfs_buf *bp = NULL; - xfs_daddr_t daddr; - int error; - - daddr = XFS_AGB_TO_DADDR(mp, agno, bno); - error = xfs_buf_incore(mp->m_ddev_targp, daddr, - XFS_FSB_TO_BB(mp, fsbcount), - XBF_LIVESCAN, &bp); - if (error) - continue; - + struct xrep_bufscan scan = { + .daddr = XFS_AGB_TO_DADDR(mp, agno, bno), + .max_sectors = xrep_bufscan_max_sectors(mp, + agbno_next - bno), + .daddr_step = XFS_FSB_TO_BB(mp, 1), + }; + struct xfs_buf *bp; + + while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { xfs_trans_bjoin(sc->tp, bp); xfs_trans_binval(sc->tp, bp); rs->invalidated++; @@ -646,3 +675,375 @@ xrep_reap_fsblocks( return 0; } + +/* + * Metadata files are not supposed to share blocks with anything else. + * If blocks are shared, we remove the reverse mapping (thus reducing the + * crosslink factor); if blocks are not shared, we also need to free them. + * + * This first step determines the longest subset of the passed-in imap + * (starting at its beginning) that is either crosslinked or not crosslinked. + * The blockcount will be adjusted down as needed. 
+ */ +STATIC int +xreap_bmapi_select( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *imap, + bool *crosslinked) +{ + struct xfs_owner_info oinfo; + struct xfs_btree_cur *cur; + xfs_filblks_t len = 1; + xfs_agblock_t bno; + xfs_agblock_t agbno; + xfs_agblock_t agbno_next; + int error; + + agbno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock); + agbno_next = agbno + imap->br_blockcount; + + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + + xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff); + error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked); + if (error) + goto out_cur; + + bno = agbno + 1; + while (bno < agbno_next) { + bool also_crosslinked; + + oinfo.oi_offset++; + error = xfs_rmap_has_other_keys(cur, bno, 1, &oinfo, + &also_crosslinked); + if (error) + goto out_cur; + + if (also_crosslinked != *crosslinked) + break; + + len++; + bno++; + } + + imap->br_blockcount = len; + trace_xreap_bmapi_select(sc->sa.pag, agbno, len, *crosslinked); +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Decide if this buffer can be joined to a transaction. This is true for most + * buffers, but there are two cases that we want to catch: large remote xattr + * value buffers are not logged and can overflow the buffer log item dirty + * bitmap size; and oversized cached buffers if things have really gone + * haywire. + */ +static inline bool +xreap_buf_loggable( + const struct xfs_buf *bp) +{ + int i; + + for (i = 0; i < bp->b_map_count; i++) { + int chunks; + int map_size; + + chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), + XFS_BLF_CHUNK); + map_size = DIV_ROUND_UP(chunks, NBWORD); + if (map_size > XFS_BLF_DATAMAP_SIZE) + return false; + } + + return true; +} + +/* + * Invalidate any buffers for this file mapping. The @imap blockcount may be + * adjusted downward if we need to roll the transaction. 
+ */ +STATIC int +xreap_bmapi_binval( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *imap) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag = sc->sa.pag; + int bmap_flags = xfs_bmapi_aflag(whichfork); + xfs_fileoff_t off; + xfs_fileoff_t max_off; + xfs_extlen_t scan_blocks; + xfs_agnumber_t agno = sc->sa.pag->pag_agno; + xfs_agblock_t bno; + xfs_agblock_t agbno; + xfs_agblock_t agbno_next; + unsigned int invalidated = 0; + int error; + + /* + * Avoid invalidating AG headers and post-EOFS blocks because we never + * own those. + */ + agbno = bno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock); + agbno_next = agbno + imap->br_blockcount; + if (!xfs_verify_agbno(pag, agbno) || + !xfs_verify_agbno(pag, agbno_next - 1)) + return 0; + + /* + * Buffers for file blocks can span multiple contiguous mappings. This + * means that for each block in the mapping, there could exist an + * xfs_buf indexed by that block with any length up to the maximum + * buffer size (remote xattr values) or to the next hole in the fork. + * To set up our binval scan, first we need to figure out the location + * of the next hole. + */ + off = imap->br_startoff + imap->br_blockcount; + max_off = off + xfs_attr3_max_rmt_blocks(mp); + while (off < max_off) { + struct xfs_bmbt_irec hmap; + int nhmaps = 1; + + error = xfs_bmapi_read(ip, off, max_off - off, &hmap, + &nhmaps, bmap_flags); + if (error) + return error; + if (nhmaps != 1 || hmap.br_startblock == DELAYSTARTBLOCK) { + ASSERT(0); + return -EFSCORRUPTED; + } + + if (!xfs_bmap_is_real_extent(&hmap)) + break; + + off = hmap.br_startoff + hmap.br_blockcount; + } + scan_blocks = off - imap->br_startoff; + + trace_xreap_bmapi_binval_scan(sc, imap, scan_blocks); + + /* + * If there are incore buffers for these blocks, invalidate them. If + * we can't (try)lock the buffer we assume it's owned by someone else + * and leave it alone. 
The buffer cache cannot detect aliasing, so + * employ nested loops to detect incore buffers of any plausible size. + */ + while (bno < agbno_next) { + struct xrep_bufscan scan = { + .daddr = XFS_AGB_TO_DADDR(mp, agno, bno), + .max_sectors = xrep_bufscan_max_sectors(mp, + scan_blocks), + .daddr_step = XFS_FSB_TO_BB(mp, 1), + }; + struct xfs_buf *bp; + + while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { + if (xreap_buf_loggable(bp)) { + xfs_trans_bjoin(sc->tp, bp); + xfs_trans_binval(sc->tp, bp); + } else { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + } + invalidated++; + + /* + * Stop invalidating if we've hit the limit; we should + * still have enough reservation left to free however + * much of the mapping we've seen so far. + */ + if (invalidated > XREAP_MAX_BINVAL) { + imap->br_blockcount = agbno_next - bno; + goto out; + } + } + + bno++; + scan_blocks--; + } + +out: + trace_xreap_bmapi_binval(sc->sa.pag, agbno, imap->br_blockcount); + return 0; +} + +/* + * Dispose of as much of the beginning of this file fork mapping as possible. + * The number of blocks disposed of is returned in @imap->br_blockcount. + */ +STATIC int +xrep_reap_bmapi_iter( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *imap, + bool crosslinked) +{ + int error; + + if (crosslinked) { + /* + * If there are other rmappings, this block is cross linked and + * must not be freed. Remove the reverse mapping, leave the + * buffer cache in its possibly confused state, and move on. + * We don't want to risk discarding valid data buffers from + * anybody else who thinks they own the block, even though that + * runs the risk of stale buffer warnings in the future. + */ + trace_xreap_dispose_unmap_extent(sc->sa.pag, + XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock), + imap->br_blockcount); + + /* + * Schedule removal of the mapping from the fork. We use + * deferred log intents in this function to control the exact + * sequence of metadata updates. 
+ */ + xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + -(int64_t)imap->br_blockcount); + xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap); + return 0; + } + + /* + * If the block is not crosslinked, we can invalidate all the incore + * buffers for the extent, and then free the extent. This is a bit of + * a mess since we don't detect discontiguous buffers that are indexed + * by a block starting before the first block of the extent but overlap + * anyway. + */ + trace_xreap_dispose_free_extent(sc->sa.pag, + XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock), + imap->br_blockcount); + + /* + * Invalidate as many buffers as we can, starting at the beginning of + * this mapping. If this function sets blockcount to zero, the + * transaction is full of logged buffer invalidations, so we need to + * return early so that we can roll and retry. + */ + error = xreap_bmapi_binval(sc, ip, whichfork, imap); + if (error || imap->br_blockcount == 0) + return error; + + /* + * Schedule removal of the mapping from the fork. We use deferred log + * intents in this function to control the exact sequence of metadata + * updates. + */ + xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + -(int64_t)imap->br_blockcount); + return xfs_free_extent_later(sc->tp, imap->br_startblock, + imap->br_blockcount, NULL, XFS_AG_RESV_NONE, true); +} + +/* + * Dispose of as much of this file extent as we can. Upon successful return, + * the imap will reflect the mapping that was removed from the fork. 
+ */ +STATIC int +xreap_ifork_extent( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *imap) +{ + xfs_agnumber_t agno; + bool crosslinked; + int error; + + ASSERT(sc->sa.pag == NULL); + + trace_xreap_ifork_extent(sc, ip, whichfork, imap); + + agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock); + sc->sa.pag = xfs_perag_get(sc->mp, agno); + if (!sc->sa.pag) + return -EFSCORRUPTED; + + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp); + if (error) + goto out_pag; + + /* + * Decide the fate of the blocks at the beginning of the mapping, then + * update the mapping to use it with the unmap calls. + */ + error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked); + if (error) + goto out_agf; + + error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked); + if (error) + goto out_agf; + +out_agf: + xfs_trans_brelse(sc->tp, sc->sa.agf_bp); + sc->sa.agf_bp = NULL; +out_pag: + xfs_perag_put(sc->sa.pag); + sc->sa.pag = NULL; + return error; +} + +/* + * Dispose of each block mapped to the given fork of the given file. Callers + * must hold ILOCK_EXCL, and ip can only be sc->ip or sc->tempip. The fork + * must not have any delalloc reservations. + */ +int +xrep_reap_ifork( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork) +{ + xfs_fileoff_t off = 0; + int bmap_flags = xfs_bmapi_aflag(whichfork); + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(ip == sc->ip || ip == sc->tempip); + ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip)); + + while (off < XFS_MAX_FILEOFF) { + struct xfs_bmbt_irec imap; + int nimaps = 1; + + /* Read the next extent, skip past holes and delalloc. 
*/ + error = xfs_bmapi_read(ip, off, XFS_MAX_FILEOFF - off, &imap, + &nimaps, bmap_flags); + if (error) + return error; + if (nimaps != 1 || imap.br_startblock == DELAYSTARTBLOCK) { + ASSERT(0); + return -EFSCORRUPTED; + } + + /* + * If this is a real space mapping, reap as much of it as we + * can in a single transaction. + */ + if (xfs_bmap_is_real_extent(&imap)) { + error = xreap_ifork_extent(sc, ip, whichfork, &imap); + if (error) + return error; + + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + } + + off = imap.br_startoff + imap.br_blockcount; + } + + return 0; +} diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h index 0b69f16dd98f..3f2f1775e29d 100644 --- a/fs/xfs/scrub/reap.h +++ b/fs/xfs/scrub/reap.h @@ -13,5 +13,26 @@ int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap, const struct xfs_owner_info *oinfo); +int xrep_reap_ifork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork); + +/* Buffer cache scan context. */ +struct xrep_bufscan { + /* Disk address for the buffers we want to scan. */ + xfs_daddr_t daddr; + + /* Maximum number of sectors to scan. */ + xfs_daddr_t max_sectors; + + /* Each round, increment the search length by this number of sectors. */ + xfs_daddr_t daddr_step; + + /* Internal scan state; initialize to zero. 
*/ + xfs_daddr_t __sector_count; +}; + +xfs_daddr_t xrep_bufscan_max_sectors(struct xfs_mount *mp, + xfs_extlen_t fsblocks); +struct xfs_buf *xrep_bufscan_advance(struct xfs_mount *mp, + struct xrep_bufscan *scan); #endif /* __XFS_SCRUB_REAP_H__ */ diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index f43dce771cdd..67478294f11a 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -32,6 +32,10 @@ #include "xfs_reflink.h" #include "xfs_health.h" #include "xfs_buf_mem.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_dir2.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -39,6 +43,7 @@ #include "scrub/bitmap.h" #include "scrub/stats.h" #include "scrub/xfile.h" +#include "scrub/attr_repair.h" /* * Attempt to repair some metadata, if the metadata is corrupt and userspace @@ -290,7 +295,7 @@ xrep_calc_ag_resblks( icount = pag->pagi_count; } else { /* Try to get the actual counters from disk. */ - error = xfs_ialloc_read_agi(pag, NULL, &bp); + error = xfs_ialloc_read_agi(pag, NULL, 0, &bp); if (!error) { icount = pag->pagi_count; xfs_buf_relse(bp); @@ -724,7 +729,7 @@ xrep_update_qflags( xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1); no_update: - mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); + mutex_unlock(&mp->m_quotainfo->qi_quotaofflock); } /* Force a quotacheck the next time we mount. 
*/ @@ -908,7 +913,7 @@ xrep_reinit_pagi( ASSERT(xfs_perag_initialised_agi(pag)); clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); - error = xfs_ialloc_read_agi(pag, sc->tp, &bp); + error = xfs_ialloc_read_agi(pag, sc->tp, 0, &bp); if (error) return error; @@ -934,7 +939,7 @@ xrep_ag_init( ASSERT(!sa->pag); - error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp); + error = xfs_ialloc_read_agi(pag, sc->tp, 0, &sa->agi_bp); if (error) return error; @@ -963,9 +968,7 @@ xrep_reset_perag_resv( ASSERT(sc->tp); sc->flags &= ~XREP_RESET_PERAG_RESV; - error = xfs_ag_resv_free(sc->sa.pag); - if (error) - goto out; + xfs_ag_resv_free(sc->sa.pag); error = xfs_ag_resv_init(sc->sa.pag, sc->tp); if (error == -ENOSPC) { xfs_err(sc->mp, @@ -974,7 +977,6 @@ xrep_reset_perag_resv( error = 0; } -out: return error; } @@ -1004,55 +1006,27 @@ xrep_metadata_inode_subtype( struct xfs_scrub *sc, unsigned int scrub_type) { - __u32 smtype = sc->sm->sm_type; - __u32 smflags = sc->sm->sm_flags; - unsigned int sick_mask = sc->sick_mask; + struct xfs_scrub_subord *sub; int error; /* - * Let's see if the inode needs repair. We're going to open-code calls - * to the scrub and repair functions so that we can hang on to the + * Let's see if the inode needs repair. Use a subordinate scrub context + * to call the scrub and repair functions so that we can hang on to the * resources that we already acquired instead of using the standard * setup/teardown routines. 
*/ - sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; - sc->sm->sm_type = scrub_type; - - switch (scrub_type) { - case XFS_SCRUB_TYPE_INODE: - error = xchk_inode(sc); - break; - case XFS_SCRUB_TYPE_BMBTD: - error = xchk_bmap_data(sc); - break; - case XFS_SCRUB_TYPE_BMBTA: - error = xchk_bmap_attr(sc); - break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - } + sub = xchk_scrub_create_subord(sc, scrub_type); + error = sub->sc.ops->scrub(&sub->sc); if (error) goto out; - - if (!xrep_will_attempt(sc)) + if (!xrep_will_attempt(&sub->sc)) goto out; /* * Repair some part of the inode. This will potentially join the inode * to the transaction. */ - switch (scrub_type) { - case XFS_SCRUB_TYPE_INODE: - error = xrep_inode(sc); - break; - case XFS_SCRUB_TYPE_BMBTD: - error = xrep_bmap(sc, XFS_DATA_FORK, false); - break; - case XFS_SCRUB_TYPE_BMBTA: - error = xrep_bmap(sc, XFS_ATTR_FORK, false); - break; - } + error = sub->sc.ops->repair(&sub->sc); if (error) goto out; @@ -1061,10 +1035,10 @@ xrep_metadata_inode_subtype( * that the inode will not be joined to the transaction when we exit * the function. */ - error = xfs_defer_finish(&sc->tp); + error = xfs_defer_finish(&sub->sc.tp); if (error) goto out; - error = xfs_trans_roll(&sc->tp); + error = xfs_trans_roll(&sub->sc.tp); if (error) goto out; @@ -1072,31 +1046,18 @@ xrep_metadata_inode_subtype( * Clear the corruption flags and re-check the metadata that we just * repaired. */ - sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; - - switch (scrub_type) { - case XFS_SCRUB_TYPE_INODE: - error = xchk_inode(sc); - break; - case XFS_SCRUB_TYPE_BMBTD: - error = xchk_bmap_data(sc); - break; - case XFS_SCRUB_TYPE_BMBTA: - error = xchk_bmap_attr(sc); - break; - } + sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + error = sub->sc.ops->scrub(&sub->sc); if (error) goto out; /* If corruption persists, the repair has failed. 
*/ - if (xchk_needs_repair(sc->sm)) { + if (xchk_needs_repair(sub->sc.sm)) { error = -EFSCORRUPTED; goto out; } out: - sc->sick_mask = sick_mask; - sc->sm->sm_type = smtype; - sc->sm->sm_flags = smflags; + xchk_scrub_free_subord(sub); return error; } @@ -1136,6 +1097,17 @@ xrep_metadata_inode_forks( return error; } + /* Clear the attr forks since metadata shouldn't have that. */ + if (xfs_inode_hasattr(sc->ip)) { + if (!dirty) { + dirty = true; + xfs_trans_ijoin(sc->tp, sc->ip, 0); + } + error = xrep_xattr_reset_fork(sc); + if (error) + return error; + } + /* * If we modified the inode, roll the transaction but don't rejoin the * inode to the new transaction because xrep_bmap_data can do that. @@ -1201,3 +1173,34 @@ xrep_trans_cancel_hook_dummy( current->journal_info = *cookiep; *cookiep = NULL; } + +/* + * See if this buffer can pass the given ->verify_struct() function. + * + * If the buffer already has ops attached and they're not the ones that were + * passed in, we reject the buffer. Otherwise, we perform the structure test + * (note that we do not check CRCs) and return the outcome of the test. The + * buffer ops and error state are left unchanged. 
+ */ +bool +xrep_buf_verify_struct( + struct xfs_buf *bp, + const struct xfs_buf_ops *ops) +{ + const struct xfs_buf_ops *old_ops = bp->b_ops; + xfs_failaddr_t fa; + int old_error; + + if (old_ops) { + if (old_ops != ops) + return false; + } + + old_error = bp->b_error; + bp->b_ops = ops; + fa = bp->b_ops->verify_struct(bp); + bp->b_ops = old_ops; + bp->b_error = old_error; + + return fa == NULL; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index ce082d941459..0e0dc2bf985c 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -90,6 +90,12 @@ int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten); int xrep_metadata_inode_forks(struct xfs_scrub *sc); int xrep_setup_ag_rmapbt(struct xfs_scrub *sc); int xrep_setup_ag_refcountbt(struct xfs_scrub *sc); +int xrep_setup_xattr(struct xfs_scrub *sc); +int xrep_setup_directory(struct xfs_scrub *sc); +int xrep_setup_parent(struct xfs_scrub *sc); +int xrep_setup_nlinks(struct xfs_scrub *sc); +int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *resblks); +int xrep_setup_dirtree(struct xfs_scrub *sc); /* Repair setup functions */ int xrep_setup_ag_allocbt(struct xfs_scrub *sc); @@ -123,11 +129,18 @@ int xrep_bmap_attr(struct xfs_scrub *sc); int xrep_bmap_cow(struct xfs_scrub *sc); int xrep_nlinks(struct xfs_scrub *sc); int xrep_fscounters(struct xfs_scrub *sc); +int xrep_xattr(struct xfs_scrub *sc); +int xrep_directory(struct xfs_scrub *sc); +int xrep_parent(struct xfs_scrub *sc); +int xrep_symlink(struct xfs_scrub *sc); +int xrep_dirtree(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xrep_rtbitmap(struct xfs_scrub *sc); +int xrep_rtsummary(struct xfs_scrub *sc); #else # define xrep_rtbitmap xrep_notsupported +# define xrep_rtsummary xrep_notsupported #endif /* CONFIG_XFS_RT */ #ifdef CONFIG_XFS_QUOTA @@ -145,6 +158,8 @@ int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep, struct xfs_trans **tpp); void xrep_trans_cancel_hook_dummy(void **cookiep, struct 
xfs_trans *tp); +bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops); + #else #define xrep_ino_dqattach(sc) (0) @@ -188,9 +203,19 @@ xrep_setup_nothing( #define xrep_setup_ag_allocbt xrep_setup_nothing #define xrep_setup_ag_rmapbt xrep_setup_nothing #define xrep_setup_ag_refcountbt xrep_setup_nothing +#define xrep_setup_xattr xrep_setup_nothing +#define xrep_setup_directory xrep_setup_nothing +#define xrep_setup_parent xrep_setup_nothing +#define xrep_setup_nlinks xrep_setup_nothing +#define xrep_setup_dirtree xrep_setup_nothing #define xrep_setup_inode(sc, imap) ((void)0) +static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x) +{ + return 0; +} + #define xrep_revalidate_allocbt (NULL) #define xrep_revalidate_iallocbt (NULL) @@ -212,6 +237,12 @@ xrep_setup_nothing( #define xrep_quotacheck xrep_notsupported #define xrep_nlinks xrep_notsupported #define xrep_fscounters xrep_notsupported +#define xrep_rtsummary xrep_notsupported +#define xrep_xattr xrep_notsupported +#define xrep_directory xrep_notsupported +#define xrep_parent xrep_notsupported +#define xrep_symlink xrep_notsupported +#define xrep_dirtree xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c index e8e07b683eab..e8080eba37d2 100644 --- a/fs/xfs/scrub/rmap_repair.c +++ b/fs/xfs/scrub/rmap_repair.c @@ -432,14 +432,6 @@ out: return error; } -static inline bool -is_rt_data_fork( - struct xfs_inode *ip, - int whichfork) -{ - return XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK; -} - /* * Iterate the block mapping btree to collect rmap records for anything in this * fork that matches the AG. 
Sets @mappings_done to true if we've scanned the @@ -578,23 +570,9 @@ xrep_rmap_scan_inode( struct xrep_rmap *rr, struct xfs_inode *ip) { - unsigned int lock_mode = 0; + unsigned int lock_mode = xrep_rmap_scan_ilock(ip); int error; - /* - * Directory updates (create/link/unlink/rename) drop the directory's - * ILOCK before finishing any rmapbt updates associated with directory - * shape changes. For this scan to coordinate correctly with the live - * update hook, we must take the only lock (i_rwsem) that is held all - * the way to dir op completion. This will get fixed by the parent - * pointer patchset. - */ - if (S_ISDIR(VFS_I(ip)->i_mode)) { - lock_mode = XFS_IOLOCK_SHARED; - xfs_ilock(ip, lock_mode); - } - lock_mode |= xrep_rmap_scan_ilock(ip); - /* Check the data fork. */ error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK); if (error) diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c index 46f5d5f605c9..0fef98e9f834 100644 --- a/fs/xfs/scrub/rtbitmap_repair.c +++ b/fs/xfs/scrub/rtbitmap_repair.c @@ -108,8 +108,6 @@ xrep_rtbitmap_data_mappings( 0, &map, &nmaps); if (error) return error; - if (nmaps != 1) - return -EFSCORRUPTED; /* Commit new extent and all deferred work. */ error = xrep_defer_finish(sc); diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index 5055092bd9e8..3fee603f5244 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -17,10 +17,14 @@ #include "xfs_bit.h" #include "xfs_bmap.h" #include "xfs_sb.h" +#include "xfs_exchmaps.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/xfile.h" +#include "scrub/repair.h" +#include "scrub/tempexch.h" +#include "scrub/rtsummary.h" /* * Realtime Summary @@ -32,18 +36,6 @@ * (potentially large) amount of data in pageable memory. 
*/ -struct xchk_rtsummary { - struct xfs_rtalloc_args args; - - uint64_t rextents; - uint64_t rbmblocks; - uint64_t rsumsize; - unsigned int rsumlevels; - - /* Memory buffer for the summary comparison. */ - union xfs_suminfo_raw words[]; -}; - /* Set us up to check the rtsummary file. */ int xchk_setup_rtsummary( @@ -60,6 +52,12 @@ xchk_setup_rtsummary( return -ENOMEM; sc->buf = rts; + if (xchk_could_repair(sc)) { + error = xrep_setup_rtsummary(sc, rts); + if (error) + return error; + } + /* * Create an xfile to construct a new rtsummary file. The xfile allows * us to avoid pinning kernel memory for this purpose. @@ -70,7 +68,7 @@ xchk_setup_rtsummary( if (error) return error; - error = xchk_trans_alloc(sc, 0); + error = xchk_trans_alloc(sc, rts->resblks); if (error) return error; @@ -135,7 +133,7 @@ xfsum_store( sumoff << XFS_WORDLOG); } -static inline int +inline int xfsum_copyout( struct xfs_scrub *sc, xfs_rtsumoff_t sumoff, @@ -362,7 +360,12 @@ xchk_rtsummary( error = xchk_rtsum_compare(sc); out_rbm: - /* Unlock the rtbitmap since we're done with it. */ + /* + * Unlock the rtbitmap since we're done with it. All other writers of + * the rt free space metadata grab the bitmap and summary ILOCKs in + * that order, so we're still protected against allocation activities + * even if we continue on to the repair function. + */ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); return error; } diff --git a/fs/xfs/scrub/rtsummary.h b/fs/xfs/scrub/rtsummary.h new file mode 100644 index 000000000000..e1d50304d8d4 --- /dev/null +++ b/fs/xfs/scrub/rtsummary.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RTSUMMARY_H__ +#define __XFS_SCRUB_RTSUMMARY_H__ + +struct xchk_rtsummary { +#ifdef CONFIG_XFS_ONLINE_REPAIR + struct xrep_tempexch tempexch; +#endif + struct xfs_rtalloc_args args; + + uint64_t rextents; + uint64_t rbmblocks; + uint64_t rsumsize; + unsigned int rsumlevels; + unsigned int resblks; + + /* suminfo position of xfile as we write buffers to disk. */ + xfs_rtsumoff_t prep_wordoff; + + /* Memory buffer for the summary comparison. */ + union xfs_suminfo_raw words[]; +}; + +int xfsum_copyout(struct xfs_scrub *sc, xfs_rtsumoff_t sumoff, + union xfs_suminfo_raw *rawinfo, unsigned int nr_words); + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_setup_rtsummary(struct xfs_scrub *sc, struct xchk_rtsummary *rts); +#else +# define xrep_setup_rtsummary(sc, rts) (0) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_RTSUMMARY_H__ */ diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c new file mode 100644 index 000000000000..d9e971c4c79f --- /dev/null +++ b/fs/xfs/scrub/rtsummary_repair.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_rtalloc.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_exchmaps.h" +#include "xfs_rtbitmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/reap.h" +#include "scrub/xfile.h" +#include "scrub/rtsummary.h" + +/* Set us up to repair the rtsummary file. 
*/ +int +xrep_setup_rtsummary( + struct xfs_scrub *sc, + struct xchk_rtsummary *rts) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long blocks; + int error; + + error = xrep_tempfile_create(sc, S_IFREG); + if (error) + return error; + + /* + * If we're doing a repair, we reserve enough blocks to write out a + * completely new summary file, plus twice as many blocks as we would + * need if we can only allocate one block per data fork mapping. This + * should cover the preallocation of the temporary file and exchanging + * the extent mappings. + * + * We cannot use xfs_exchmaps_estimate because we have not yet + * constructed the replacement rtsummary and therefore do not know how + * many extents it will use. By the time we do, we will have a dirty + * transaction (which we cannot drop because we cannot drop the + * rtsummary ILOCK) and cannot ask for more reservation. + */ + blocks = XFS_B_TO_FSB(mp, mp->m_rsumsize); + blocks += xfs_bmbt_calc_size(mp, blocks) * 2; + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + + rts->resblks += blocks; + return 0; +} + +static int +xrep_rtsummary_prep_buf( + struct xfs_scrub *sc, + struct xfs_buf *bp, + void *data) +{ + struct xchk_rtsummary *rts = data; + struct xfs_mount *mp = sc->mp; + union xfs_suminfo_raw *ondisk; + int error; + + rts->args.mp = sc->mp; + rts->args.tp = sc->tp; + rts->args.sumbp = bp; + ondisk = xfs_rsumblock_infoptr(&rts->args, 0); + rts->args.sumbp = NULL; + + bp->b_ops = &xfs_rtbuf_ops; + + error = xfsum_copyout(sc, rts->prep_wordoff, ondisk, mp->m_blockwsize); + if (error) + return error; + + rts->prep_wordoff += mp->m_blockwsize; + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTSUMMARY_BUF); + return 0; +} + +/* Repair the realtime summary. */ +int +xrep_rtsummary( + struct xfs_scrub *sc) +{ + struct xchk_rtsummary *rts = sc->buf; + struct xfs_mount *mp = sc->mp; + xfs_filblks_t rsumblocks; + int error; + + /* We require the rmapbt to rebuild anything. 
*/ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + /* We require atomic file exchange range to rebuild anything. */ + if (!xfs_has_exchange_range(mp)) + return -EOPNOTSUPP; + + /* Walk away if we disagree on the size of the rt bitmap. */ + if (rts->rbmblocks != mp->m_sb.sb_rbmblocks) + return 0; + + /* Make sure any problems with the fork are fixed. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + /* + * Try to take ILOCK_EXCL of the temporary file. We had better be the + * only ones holding onto this inode, but we can't block while holding + * the rtsummary file's ILOCK_EXCL. + */ + while (!xrep_tempfile_ilock_nowait(sc)) { + if (xchk_should_terminate(sc, &error)) + return error; + delay(1); + } + + /* Make sure we have space allocated for the entire summary file. */ + rsumblocks = XFS_B_TO_FSB(mp, rts->rsumsize); + xfs_trans_ijoin(sc->tp, sc->ip, 0); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + error = xrep_tempfile_prealloc(sc, 0, rsumblocks); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* Copy the rtsummary file that we generated. */ + error = xrep_tempfile_copyin(sc, 0, rsumblocks, + xrep_rtsummary_prep_buf, rts); + if (error) + return error; + error = xrep_tempfile_set_isize(sc, rts->rsumsize); + if (error) + return error; + + /* + * Now exchange the contents. Nothing in repair uses the temporary + * buffer, so we can reuse it for the tempfile exchrange information. + */ + error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, &rts->tempexch); + if (error) + return error; + + error = xrep_tempexch_contents(sc, &rts->tempexch); + if (error) + return error; + + /* Reset incore state and blow out the summary cache. 
*/ + if (mp->m_rsum_cache) + memset(mp->m_rsum_cache, 0xFF, mp->m_sb.sb_rbmblocks); + + mp->m_rsumlevels = rts->rsumlevels; + mp->m_rsumsize = rts->rsumsize; + + /* Free the old rtsummary blocks if they're not in use. */ + return xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 20fac9723c08..c013f0ba4f36 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -17,6 +17,11 @@ #include "xfs_scrub.h" #include "xfs_buf_mem.h" #include "xfs_rmap.h" +#include "xfs_exchrange.h" +#include "xfs_exchmaps.h" +#include "xfs_dir2.h" +#include "xfs_parent.h" +#include "xfs_icache.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -24,6 +29,8 @@ #include "scrub/health.h" #include "scrub/stats.h" #include "scrub/xfile.h" +#include "scrub/tempfile.h" +#include "scrub/orphanage.h" /* * Online Scrub and Repair @@ -171,6 +178,39 @@ xchk_fsgates_disable( sc->flags &= ~XCHK_FSGATES_ALL; } +/* Free the resources associated with a scrub subtype. */ +void +xchk_scrub_free_subord( + struct xfs_scrub_subord *sub) +{ + struct xfs_scrub *sc = sub->parent_sc; + /* The subordinate must still hold the same inodes as its parent. */ + ASSERT(sc->ip == sub->sc.ip); + ASSERT(sc->orphanage == sub->sc.orphanage); + ASSERT(sc->tempip == sub->sc.tempip); + /* Restore the saved type and flags, keep any OUT flags that are now set, and adopt the subordinate's transaction. */ + sc->sm->sm_type = sub->old_smtype; + sc->sm->sm_flags = sub->old_smflags | + (sc->sm->sm_flags & XFS_SCRUB_FLAGS_OUT); + sc->tp = sub->sc.tp; + /* Release whatever buffer, xmbuf target, or xfile the subordinate set up for itself. */ + if (sub->sc.buf) { + if (sub->sc.buf_cleanup) + sub->sc.buf_cleanup(sub->sc.buf); + kvfree(sub->sc.buf); + } + if (sub->sc.xmbtp) + xmbuf_free(sub->sc.xmbtp); + if (sub->sc.xfile) + xfile_destroy(sub->sc.xfile); + /* Carry the subordinate's lock state back to the parent. */ + sc->ilock_flags = sub->sc.ilock_flags; + sc->orphanage_ilock_flags = sub->sc.orphanage_ilock_flags; + sc->temp_ilock_flags = sub->sc.temp_ilock_flags; + + kfree(sub); +} + /* Free all the resources and finish the transactions. 
*/ STATIC int xchk_teardown( @@ -211,6 +251,8 @@ xchk_teardown( sc->buf = NULL; } + xrep_tempfile_rele(sc); + xrep_orphanage_rele(sc); xchk_fsgates_disable(sc); return error; } @@ -319,25 +361,25 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_INODE, .setup = xchk_setup_directory, .scrub = xchk_directory, - .repair = xrep_notsupported, + .repair = xrep_directory, }, [XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */ .type = ST_INODE, .setup = xchk_setup_xattr, .scrub = xchk_xattr, - .repair = xrep_notsupported, + .repair = xrep_xattr, }, [XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */ .type = ST_INODE, .setup = xchk_setup_symlink, .scrub = xchk_symlink, - .repair = xrep_notsupported, + .repair = xrep_symlink, }, [XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */ .type = ST_INODE, .setup = xchk_setup_parent, .scrub = xchk_parent, - .repair = xrep_notsupported, + .repair = xrep_parent, }, [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ .type = ST_FS, @@ -349,7 +391,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, - .repair = xrep_notsupported, + .repair = xrep_rtsummary, }, [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ .type = ST_FS, @@ -393,6 +435,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .scrub = xchk_health_record, .repair = xrep_notsupported, }, + [XFS_SCRUB_TYPE_DIRTREE] = { /* directory tree structure */ + .type = ST_INODE, + .setup = xchk_setup_dirtree, + .scrub = xchk_dirtree, + .has = xfs_has_parent, + .repair = xrep_dirtree, + }, }; static int @@ -497,8 +546,38 @@ static inline void xchk_postmortem(struct xfs_scrub *sc) } #endif /* CONFIG_XFS_ONLINE_REPAIR */ +/* + * Create a new scrub context from an existing one, but with a different scrub + * type. 
+ */ +struct xfs_scrub_subord * +xchk_scrub_create_subord( + struct xfs_scrub *sc, + unsigned int subtype) +{ + struct xfs_scrub_subord *sub; + /* Allocation failure is reported as an ERR_PTR, not NULL; callers must test with IS_ERR(). */ + sub = kzalloc(sizeof(*sub), XCHK_GFP_FLAGS); + if (!sub) + return ERR_PTR(-ENOMEM); + /* Save the parent's type and flags, then clone the context. The clone points at the same sm, so the sm writes below change the parent's request until the subordinate is freed; buf, xfile and xmbtp start out empty. */ + sub->old_smtype = sc->sm->sm_type; + sub->old_smflags = sc->sm->sm_flags; + sub->parent_sc = sc; + memcpy(&sub->sc, sc, sizeof(struct xfs_scrub)); + sub->sc.ops = &meta_scrub_ops[subtype]; + sub->sc.sm->sm_type = subtype; + sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + sub->sc.buf = NULL; + sub->sc.buf_cleanup = NULL; + sub->sc.xfile = NULL; + sub->sc.xmbtp = NULL; + + return sub; +} + /* Dispatch metadata scrubbing. */ -int +STATIC int xfs_scrub_metadata( struct file *file, struct xfs_scrub_metadata *sm) @@ -540,6 +619,7 @@ xfs_scrub_metadata( sc->sm = sm; sc->ops = &meta_scrub_ops[sm->sm_type]; sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type); + sc->relax = INIT_XCHK_RELAX; retry_op: /* * When repairs are allowed, prevent freezing or readonly remount while @@ -643,3 +723,221 @@ try_harder: run.retries++; goto retry_op; } + +/* Scrub one aspect of one piece of metadata. */ +int +xfs_ioc_scrub_metadata( + struct file *file, + void __user *arg) +{ + struct xfs_scrub_metadata scrub; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* Copy the request in, run it, and copy the request structure back out. */ + if (copy_from_user(&scrub, arg, sizeof(scrub))) + return -EFAULT; + + error = xfs_scrub_metadata(file, &scrub); + if (error) + return error; + /* The structure is only copied back when the scrub call returned zero. */ + if (copy_to_user(arg, &scrub, sizeof(scrub))) + return -EFAULT; + + return 0; +} + +/* Decide if there have been any scrub failures up to this point. 
*/ +static inline int +xfs_scrubv_check_barrier( + struct xfs_mount *mp, + const struct xfs_scrub_vec *vectors, + const struct xfs_scrub_vec *stop_vec) +{ + const struct xfs_scrub_vec *v; + __u32 failmask; + /* Only the OUT flags set on the barrier vector itself count as failures. */ + failmask = stop_vec->sv_flags & XFS_SCRUB_FLAGS_OUT; + /* Examine every vector that precedes the barrier, skipping earlier barriers. */ + for (v = vectors; v < stop_vec; v++) { + if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) + continue; + + /* + * Runtime errors count as a previous failure, except the ones + * used to ask userspace to retry. + */ + switch (v->sv_ret) { + case -EBUSY: + case -ENOENT: + case -EUSERS: + case 0: + break; + default: + return -ECANCELED; + } + + /* + * If any of the out-flags on the scrub vector match the mask + * that was set on the barrier vector, that's a previous fail. + */ + if (v->sv_flags & failmask) + return -ECANCELED; + } + + return 0; +} + +/* + * If the caller provided us with a nonzero inode number that isn't the ioctl + * file, try to grab a reference to it to eliminate all further untrusted inode + * lookups. If we can't get the inode, let each scrub function try again. + */ +STATIC struct xfs_inode * +xchk_scrubv_open_by_handle( + struct xfs_mount *mp, + const struct xfs_scrub_vec_head *head) +{ + struct xfs_trans *tp; + struct xfs_inode *ip; + int error; + /* Every failure below returns NULL rather than an error code. */ + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return NULL; + /* The empty transaction is only needed for the lookup itself. */ + error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip); + xfs_trans_cancel(tp); + if (error) + return NULL; + /* The inode must also match the generation given in the handle; otherwise drop the reference. */ + if (VFS_I(ip)->i_generation != head->svh_gen) { + xfs_irele(ip); + return NULL; + } + + return ip; +} + +/* Vectored scrub implementation to reduce ioctl calls. 
*/ +int +xfs_ioc_scrubv_metadata( + struct file *file, + void __user *arg) +{ + struct xfs_scrub_vec_head head; + struct xfs_scrub_vec_head __user *uhead = arg; + struct xfs_scrub_vec *vectors; + struct xfs_scrub_vec __user *uvectors; + struct xfs_inode *ip_in = XFS_I(file_inode(file)); + struct xfs_mount *mp = ip_in->i_mount; + struct xfs_inode *handle_ip = NULL; + struct xfs_scrub_vec *v; + size_t vec_bytes; + unsigned int i; + int error = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&head, uhead, sizeof(head))) + return -EFAULT; + + if (head.svh_reserved) + return -EINVAL; + if (head.svh_flags & ~XFS_SCRUB_VEC_FLAGS_ALL) + return -EINVAL; + if (head.svh_nr == 0) + return 0; + + vec_bytes = array_size(head.svh_nr, sizeof(struct xfs_scrub_vec)); + if (vec_bytes > PAGE_SIZE) + return -ENOMEM; + + uvectors = (void __user *)(uintptr_t)head.svh_vectors; + vectors = memdup_user(uvectors, vec_bytes); + if (IS_ERR(vectors)) + return PTR_ERR(vectors); + + trace_xchk_scrubv_start(ip_in, &head); + + for (i = 0, v = vectors; i < head.svh_nr; i++, v++) { + if (v->sv_reserved) { + error = -EINVAL; + goto out_free; + } + + if (v->sv_type == XFS_SCRUB_TYPE_BARRIER && + (v->sv_flags & ~XFS_SCRUB_FLAGS_OUT)) { + error = -EINVAL; + goto out_free; + } + + trace_xchk_scrubv_item(mp, &head, i, v); + } + + /* + * If the caller wants us to do a scrub-by-handle and the file used to + * call the ioctl is not the same file, load the incore inode and pin + * it across all the scrubv actions to avoid repeated UNTRUSTED + * lookups. The reference is not passed to deeper layers of scrub + * because each scrubber gets to decide its own strategy and return + * values for getting an inode. + */ + if (head.svh_ino && head.svh_ino != ip_in->i_ino) + handle_ip = xchk_scrubv_open_by_handle(mp, &head); + + /* Run all the scrubbers. 
*/ + for (i = 0, v = vectors; i < head.svh_nr; i++, v++) { + struct xfs_scrub_metadata sm = { + .sm_type = v->sv_type, + .sm_flags = v->sv_flags, + .sm_ino = head.svh_ino, + .sm_gen = head.svh_gen, + .sm_agno = head.svh_agno, + }; + + if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) { + v->sv_ret = xfs_scrubv_check_barrier(mp, vectors, v); + if (v->sv_ret) { + trace_xchk_scrubv_barrier_fail(mp, &head, i, v); + break; + } + + continue; + } + + v->sv_ret = xfs_scrub_metadata(file, &sm); + v->sv_flags = sm.sm_flags; + + trace_xchk_scrubv_outcome(mp, &head, i, v); + + if (head.svh_rest_us) { + ktime_t expires; + + expires = ktime_add_ns(ktime_get(), + head.svh_rest_us * 1000); + set_current_state(TASK_KILLABLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } + + if (fatal_signal_pending(current)) { + error = -EINTR; + goto out_free; + } + } + + if (copy_to_user(uvectors, vectors, vec_bytes) || + copy_to_user(uhead, &head, sizeof(head))) { + error = -EFAULT; + goto out_free; + } + +out_free: + if (handle_ip) + xfs_irele(handle_ip); + kfree(vectors); + return error; +} diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 9ad65b604fe1..1bc33f010d0e 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -8,6 +8,49 @@ struct xfs_scrub; +struct xchk_relax { + unsigned long next_resched; + unsigned int resched_nr; + bool interruptible; +}; + +/* Yield to the scheduler at most 10x per second. */ +#define XCHK_RELAX_NEXT (jiffies + (HZ / 10)) + +#define INIT_XCHK_RELAX \ + (struct xchk_relax){ \ + .next_resched = XCHK_RELAX_NEXT, \ + .resched_nr = 0, \ + .interruptible = true, \ + } + +/* + * Relax during a scrub operation and exit if there's a fatal signal pending. + * + * If preemption is disabled, we need to yield to the scheduler every now and + * then so that we don't run afoul of the soft lockup watchdog or RCU stall + * detector. cond_resched calls are somewhat expensive (~5ns) so we want to + * ratelimit this to 10x per second. 
Amortize the cost of the other checks by + * only doing it once every 100 calls. + */ +static inline int xchk_maybe_relax(struct xchk_relax *widget) +{ + /* Amortize the cost of scheduling and checking signals. */ + if (likely(++widget->resched_nr < 100)) + return 0; + widget->resched_nr = 0; + + if (unlikely(widget->next_resched <= jiffies)) { + cond_resched(); + widget->next_resched = XCHK_RELAX_NEXT; + } + + if (widget->interruptible && fatal_signal_pending(current)) + return -EINTR; + + return 0; +} + /* * Standard flags for allocating memory within scrub. NOFS context is * configured by the process allocation scope. Scrub and repair must be able @@ -17,6 +60,13 @@ struct xfs_scrub; #define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \ __GFP_RETRY_MAYFAIL)) +/* + * For opening files by handle for fsck operations, we don't trust the inumber + * or the allocation state; therefore, perform an untrusted lookup. We don't + * want these inodes to pollute the cache, so mark them for immediate removal. + */ +#define XCHK_IGET_FLAGS (XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE) + /* Type info and names for the scrub types. */ enum xchk_type { ST_NONE = 1, /* disabled */ @@ -105,6 +155,14 @@ struct xfs_scrub { /* Lock flags for @ip. */ uint ilock_flags; + /* The orphanage, for stashing files that have lost their parent. */ + uint orphanage_ilock_flags; + struct xfs_inode *orphanage; + + /* A temporary file on this filesystem, for staging new metadata. */ + struct xfs_inode *tempip; + uint temp_ilock_flags; + /* See the XCHK/XREP state flags below. */ unsigned int flags; @@ -115,6 +173,9 @@ struct xfs_scrub { */ unsigned int sick_mask; + /* next time we want to cond_resched() */ + struct xchk_relax relax; + /* State tracking for single-AG operations. 
*/ struct xchk_ag sa; }; @@ -141,6 +202,35 @@ struct xfs_scrub { XCHK_FSGATES_DIRENTS | \ XCHK_FSGATES_RMAP) +struct xfs_scrub_subord { + struct xfs_scrub sc; + struct xfs_scrub *parent_sc; + unsigned int old_smtype; + unsigned int old_smflags; +}; + +struct xfs_scrub_subord *xchk_scrub_create_subord(struct xfs_scrub *sc, + unsigned int subtype); +void xchk_scrub_free_subord(struct xfs_scrub_subord *sub); + +/* + * We /could/ terminate a scrub/repair operation early. If we're not + * in a good place to continue (fatal signal, etc.) then bail out. + * Note that we're careful not to make any judgements about *error. + */ +static inline bool +xchk_should_terminate( + struct xfs_scrub *sc, + int *error) +{ + if (xchk_maybe_relax(&sc->relax)) { + if (*error == 0) + *error = -EINTR; + return true; + } + return false; +} + /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); int xchk_superblock(struct xfs_scrub *sc); @@ -159,6 +249,7 @@ int xchk_directory(struct xfs_scrub *sc); int xchk_xattr(struct xfs_scrub *sc); int xchk_symlink(struct xfs_scrub *sc); int xchk_parent(struct xfs_scrub *sc); +int xchk_dirtree(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xchk_rtbitmap(struct xfs_scrub *sc); int xchk_rtsummary(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c index 42cafbed94ac..7996c2335476 100644 --- a/fs/xfs/scrub/stats.c +++ b/fs/xfs/scrub/stats.c @@ -79,6 +79,7 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters", [XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck", [XFS_SCRUB_TYPE_NLINKS] = "nlinks", + [XFS_SCRUB_TYPE_DIRTREE] = "dirtree", }; /* Format the scrub stats into a text buffer, similar to pcp style. 
*/ diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index d77d8a9598f6..c848bcc07cd5 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -10,6 +10,7 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_inode.h" #include "xfs_symlink.h" #include "xfs_health.h" @@ -17,18 +18,28 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/health.h" +#include "scrub/repair.h" /* Set us up to scrub a symbolic link. */ int xchk_setup_symlink( struct xfs_scrub *sc) { + unsigned int resblks = 0; + int error; + /* Allocate the buffer without the inode lock held. */ sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; - return xchk_setup_inode_contents(sc, 0); + if (xchk_could_repair(sc)) { + error = xrep_setup_symlink(sc, &resblks); + if (error) + return error; + } + + return xchk_setup_inode_contents(sc, resblks); } /* Symbolic links. */ diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c new file mode 100644 index 000000000000..d015a86ef460 --- /dev/null +++ b/fs/xfs/scrub/symlink_repair.c @@ -0,0 +1,509 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_symlink.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_symlink_remote.h" +#include "xfs_exchmaps.h" +#include "xfs_exchrange.h" +#include "xfs_health.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/reap.h" + +/* + * Symbolic Link Repair + * ==================== + * + * We repair symbolic links by reading whatever target data we can find, up to + * the first NULL byte. If the recovered target strlen matches i_size, then + * we rewrite the target. In all other cases, we replace the target with an + * overly long string that cannot possibly resolve. The new target is written + * into a private hidden temporary file, and then a file contents exchange + * commits the new symlink target to the file being repaired. + */ + +/* Set us up to repair the symlink file. */ +int +xrep_setup_symlink( + struct xfs_scrub *sc, + unsigned int *resblks) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long blocks; + int error; + + error = xrep_tempfile_create(sc, S_IFLNK); + if (error) + return error; + + /* + * If we're doing a repair, we reserve enough blocks to write out a + * completely new symlink file, plus twice as many blocks as we would + * need if we can only allocate one block per data fork mapping. This + * should cover the preallocation of the temporary file and exchanging + * the extent mappings. 
+ * + * We cannot use xfs_exchmaps_estimate because we have not yet + * constructed the replacement symlink and therefore do not know how + * many extents it will use. By the time we do, we will have a dirty + * transaction (which we cannot drop because we cannot drop the + * symlink ILOCK) and cannot ask for more reservation. + */ + blocks = xfs_symlink_blocks(sc->mp, XFS_SYMLINK_MAXLEN); + blocks += xfs_bmbt_calc_size(mp, blocks) * 2; + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + + *resblks += blocks; + return 0; +} + +/* + * Try to salvage the pathname from remote blocks. Returns the number of bytes + * salvaged or a negative errno. + */ +STATIC ssize_t +xrep_symlink_salvage_remote( + struct xfs_scrub *sc) +{ + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_inode *ip = sc->ip; + struct xfs_buf *bp; + char *target_buf = sc->buf; + xfs_failaddr_t fa; + xfs_filblks_t fsblocks; + xfs_daddr_t d; + loff_t len; + loff_t offset = 0; + unsigned int byte_cnt; + bool magic_ok; + bool hdr_ok; + int n; + int nmaps = XFS_SYMLINK_MAPS; + int error; + + /* We'll only read until the buffer is full. */ + len = min_t(loff_t, ip->i_disk_size, XFS_SYMLINK_MAXLEN); + fsblocks = xfs_symlink_blocks(sc->mp, len); + error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); + if (error) + return error; + + for (n = 0; n < nmaps; n++) { + struct xfs_dsymlink_hdr *dsl; + + d = XFS_FSB_TO_DADDR(sc->mp, mval[n].br_startblock); + + /* Read the rmt block. We'll run the verifiers manually. */ + error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, + d, XFS_FSB_TO_BB(sc->mp, mval[n].br_blockcount), + 0, &bp, NULL); + if (error) + return error; + bp->b_ops = &xfs_symlink_buf_ops; + + /* How many bytes do we expect to get out of this buffer? */ + byte_cnt = XFS_FSB_TO_B(sc->mp, mval[n].br_blockcount); + byte_cnt = XFS_SYMLINK_BUF_SPACE(sc->mp, byte_cnt); + byte_cnt = min_t(unsigned int, byte_cnt, len); + + /* + * See if the verifiers accept this block. 
We're willing to + * salvage if the offset/byte/ino are ok and either the + * verifier passed or the magic is ok. Anything else and we + * stop dead in our tracks. + */ + fa = bp->b_ops->verify_struct(bp); + dsl = bp->b_addr; + magic_ok = dsl->sl_magic == cpu_to_be32(XFS_SYMLINK_MAGIC); + hdr_ok = xfs_symlink_hdr_ok(ip->i_ino, offset, byte_cnt, bp); + if (!hdr_ok || (fa != NULL && !magic_ok)) + break; + + memcpy(target_buf + offset, dsl + 1, byte_cnt); + + len -= byte_cnt; + offset += byte_cnt; + } + return offset; +} + +/* + * Try to salvage an inline symlink's contents. Returns the number of bytes + * salvaged or a negative errno. + */ +STATIC ssize_t +xrep_symlink_salvage_inline( + struct xfs_scrub *sc) +{ + struct xfs_inode *ip = sc->ip; + char *target_buf = sc->buf; + char *old_target; + struct xfs_ifork *ifp; + unsigned int nr; + + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + if (!ifp->if_data) + return 0; + + /* + * If inode repair zapped the link target, pretend that we didn't find + * any bytes at all so that we can replace the (now totally lost) link + * target with a warning message. + */ + old_target = ifp->if_data; + if (xfs_inode_has_sickness(sc->ip, XFS_SICK_INO_SYMLINK_ZAPPED) && + sc->ip->i_disk_size == 1 && old_target[0] == '?') + return 0; + + nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip)); + strncpy(target_buf, ifp->if_data, nr); + return nr; +} + +#define DUMMY_TARGET \ + "The target of this symbolic link could not be recovered at all and " \ + "has been replaced with this explanatory message. To avoid " \ + "accidentally pointing to an existing file path, this message is " \ + "longer than the maximum supported file name length. That is an " \ + "acceptable length for a symlink target on XFS but will produce " \ + "File Name Too Long errors if resolved." + +/* Salvage whatever we can of the target. 
*/ +STATIC int +xrep_symlink_salvage( + struct xfs_scrub *sc) +{ + char *target_buf = sc->buf; + ssize_t buflen = 0; + + BUILD_BUG_ON(sizeof(DUMMY_TARGET) - 1 <= NAME_MAX); + + /* + * Salvage the target if there weren't any corruption problems observed + * while scanning it. + */ + if (!(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + if (sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) + buflen = xrep_symlink_salvage_inline(sc); + else + buflen = xrep_symlink_salvage_remote(sc); + if (buflen < 0) + return buflen; + + /* + * NULL-terminate the buffer because the ondisk target does not + * do that for us. If salvage didn't find the exact amount of + * data that we expected to find, don't salvage anything. + */ + target_buf[buflen] = 0; + if (strlen(target_buf) != sc->ip->i_disk_size) + buflen = 0; + } + + /* + * Change an empty target into a dummy target and clear the symlink + * target zapped flag. + */ + if (buflen == 0) { + sc->sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; + sprintf(target_buf, DUMMY_TARGET); + } + + trace_xrep_symlink_salvage_target(sc->ip, target_buf, + strlen(target_buf)); + return 0; +} + +STATIC void +xrep_symlink_local_to_remote( + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp, + void *priv) +{ + struct xfs_scrub *sc = priv; + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + xfs_symlink_local_to_remote(tp, bp, ip, ifp, NULL); + + if (!xfs_has_crc(sc->mp)) + return; + + dsl->sl_owner = cpu_to_be64(sc->ip->i_ino); + xfs_trans_log_buf(tp, bp, 0, + sizeof(struct xfs_dsymlink_hdr) + ifp->if_bytes - 1); +} + +/* + * Prepare both links' data forks for an exchange. Promote the tempfile from + * local format to extents format, and if the file being repaired has a short + * format data fork, turn it into an empty extent list. 
+ */ +STATIC int +xrep_symlink_swap_prep( + struct xfs_scrub *sc, + bool temp_local, + bool ip_local) +{ + int error; + + /* + * If the temp link is in shortform format, convert that to a remote + * target so that we can use the atomic mapping exchange. + */ + if (temp_local) { + int logflags = XFS_ILOG_CORE; + + error = xfs_bmap_local_to_extents(sc->tp, sc->tempip, 1, + &logflags, XFS_DATA_FORK, + xrep_symlink_local_to_remote, + sc); + if (error) + return error; + + xfs_trans_log_inode(sc->tp, sc->ip, 0); + + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + } + + /* + * If the file being repaired had a shortform data fork, convert that + * to an empty extent list in preparation for the atomic mapping + * exchange. + */ + if (ip_local) { + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + xfs_idestroy_fork(ifp); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; + ifp->if_nextents = 0; + ifp->if_bytes = 0; + ifp->if_data = NULL; + ifp->if_height = 0; + + xfs_trans_log_inode(sc->tp, sc->ip, + XFS_ILOG_CORE | XFS_ILOG_DDATA); + } + + return 0; +} + +/* Exchange the temporary symlink's data fork with the one being repaired. */ +STATIC int +xrep_symlink_swap( + struct xfs_scrub *sc) +{ + struct xrep_tempexch *tx = sc->buf; + bool ip_local, temp_local; + int error; + + ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL; + temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL; + + /* + * If both links have a local format data fork and the rebuilt + * remote data would fit in the repaired file's data fork, copy the + * contents from the tempfile and declare ourselves done. + */ + if (ip_local && temp_local && + sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) { + xrep_tempfile_copyout_local(sc, XFS_DATA_FORK); + return 0; + } + + /* Otherwise, make sure both data forks are in block-mapping mode. 
*/ + error = xrep_symlink_swap_prep(sc, temp_local, ip_local); + if (error) + return error; + + return xrep_tempexch_contents(sc, tx); +} + +/* + * Free all the remote blocks and reset the data fork. The caller must join + * the inode to the transaction. This function returns with the inode joined + * to a clean scrub transaction. + */ +STATIC int +xrep_symlink_reset_fork( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK); + int error; + + /* Unmap all the remote target buffers. */ + if (xfs_ifork_has_extents(ifp)) { + error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); + if (error) + return error; + } + + trace_xrep_symlink_reset_fork(sc->tempip); + + /* Reset the temp symlink target to dummy content. */ + xfs_idestroy_fork(ifp); + return xfs_symlink_write_target(sc->tp, sc->tempip, sc->tempip->i_ino, + "?", 1, 0, 0); +} + +/* + * Reinitialize a link target. Caller must ensure the inode is joined to + * the transaction. + */ +STATIC int +xrep_symlink_rebuild( + struct xfs_scrub *sc) +{ + struct xrep_tempexch *tx; + char *target_buf = sc->buf; + xfs_fsblock_t fs_blocks; + unsigned int target_len; + unsigned int resblks; + int error; + + /* How many blocks do we need? */ + target_len = strlen(target_buf); + ASSERT(target_len != 0); + if (target_len == 0 || target_len > XFS_SYMLINK_MAXLEN) + return -EFSCORRUPTED; + + trace_xrep_symlink_rebuild(sc->ip); + + /* + * In preparation to write the new symlink target to the temporary + * file, drop the ILOCK of the file being repaired (it shouldn't be + * joined) and take the ILOCK of the temporary file. + * + * The VFS does not take the IOLOCK while reading a symlink (and new + * symlinks are hidden with INEW until they've been written) so it's + * possible that a readlink() could see the old corrupted contents + * while we're doing this. 
+ */ + xchk_iunlock(sc, XFS_ILOCK_EXCL); + xrep_tempfile_ilock(sc); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + + /* + * Reserve resources to reinitialize the target. We're allowed to + * exceed file quota to repair inconsistent metadata, though this is + * unlikely. + */ + fs_blocks = xfs_symlink_blocks(sc->mp, target_len); + resblks = xfs_symlink_space_res(sc->mp, target_len, fs_blocks); + error = xfs_trans_reserve_quota_nblks(sc->tp, sc->tempip, resblks, 0, + true); + if (error) + return error; + + /* Erase the dummy target set up by the tempfile initialization. */ + xfs_idestroy_fork(&sc->tempip->i_df); + sc->tempip->i_df.if_bytes = 0; + sc->tempip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; + + /* Write the salvaged target to the temporary link. */ + error = xfs_symlink_write_target(sc->tp, sc->tempip, sc->ip->i_ino, + target_buf, target_len, fs_blocks, resblks); + if (error) + return error; + + /* + * Commit the repair transaction so that we can use the atomic mapping + * exchange functions to compute the correct block reservations and + * re-lock the inodes. + */ + target_buf = NULL; + error = xrep_trans_commit(sc); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + xrep_tempfile_iunlock(sc); + + /* + * We're done with the temporary buffer, so we can reuse it for the + * tempfile contents exchange information. + */ + tx = sc->buf; + error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, tx); + if (error) + return error; + + /* + * Exchange the temp link's data fork with the file being repaired. + * This recreates the transaction and takes the ILOCKs of the file + * being repaired and the temporary file. + */ + error = xrep_symlink_swap(sc); + if (error) + return error; + + /* + * Release the old symlink blocks and reset the data fork of the temp + * link to an empty shortform link. 
This is the last repair action we + * perform on the symlink, so we don't need to clean the transaction. + */ + return xrep_symlink_reset_fork(sc); +} + +/* Repair a symbolic link. */ +int +xrep_symlink( + struct xfs_scrub *sc) +{ + int error; + + /* The rmapbt is required to reap the old data fork. */ + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + /* We require atomic file exchange range to rebuild anything. */ + if (!xfs_has_exchange_range(sc->mp)) + return -EOPNOTSUPP; + + ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); + + error = xrep_symlink_salvage(sc); + if (error) + return error; + + /* Now reset the target. */ + error = xrep_symlink_rebuild(sc); + if (error) + return error; + + return xrep_trans_commit(sc); +} diff --git a/fs/xfs/scrub/tempexch.h b/fs/xfs/scrub/tempexch.h new file mode 100644 index 000000000000..995ba187c5aa --- /dev/null +++ b/fs/xfs/scrub/tempexch.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_TEMPEXCH_H__ +#define __XFS_SCRUB_TEMPEXCH_H__ + +#ifdef CONFIG_XFS_ONLINE_REPAIR +struct xrep_tempexch { + struct xfs_exchmaps_req req; +}; + +int xrep_tempexch_trans_reserve(struct xfs_scrub *sc, int whichfork, + struct xrep_tempexch *ti); +int xrep_tempexch_trans_alloc(struct xfs_scrub *sc, int whichfork, + struct xrep_tempexch *ti); + +int xrep_tempexch_contents(struct xfs_scrub *sc, struct xrep_tempexch *ti); +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_TEMPEXCH_H__ */ diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c new file mode 100644 index 000000000000..b747b625c5ee --- /dev/null +++ b/fs/xfs/scrub/tempfile.c @@ -0,0 +1,851 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_quota.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_dir2.h" +#include "xfs_exchrange.h" +#include "xfs_exchmaps.h" +#include "xfs_defer.h" +#include "xfs_symlink_remote.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/trace.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/xfile.h" + +/* + * Create a temporary file for reconstructing metadata, with the intention of + * atomically exchanging the temporary file's contents with the file that's + * being repaired. + */ +int +xrep_tempfile_create( + struct xfs_scrub *sc, + uint16_t mode) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = NULL; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_trans_res *tres; + struct xfs_inode *dp = mp->m_rootip; + xfs_ino_t ino; + unsigned int resblks; + bool is_dir = S_ISDIR(mode); + int error; + + if (xfs_is_shutdown(mp)) + return -EIO; + if (xfs_is_readonly(mp)) + return -EROFS; + + ASSERT(sc->tp == NULL); + ASSERT(sc->tempip == NULL); + + /* + * Make sure that we have allocated dquot(s) on disk. The temporary + * inode should be completely root owned so that we don't fail due to + * quota limits. 
+ */ + error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, + XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp); + if (error) + return error; + + if (is_dir) { + resblks = xfs_mkdir_space_res(mp, 0); + tres = &M_RES(mp)->tr_mkdir; + } else { + resblks = XFS_IALLOC_SPACE_RES(mp); + tres = &M_RES(mp)->tr_create_tmpfile; + } + + error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks, + &tp); + if (error) + goto out_release_dquots; + + /* Allocate inode, set up directory. */ + error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); + if (error) + goto out_trans_cancel; + error = xfs_init_new_inode(&nop_mnt_idmap, tp, dp, ino, mode, 0, 0, + 0, false, &sc->tempip); + if (error) + goto out_trans_cancel; + + /* Change the ownership of the inode to root. */ + VFS_I(sc->tempip)->i_uid = GLOBAL_ROOT_UID; + VFS_I(sc->tempip)->i_gid = GLOBAL_ROOT_GID; + sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT); + xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE); + + /* + * Mark our temporary file as private so that LSMs and the ACL code + * don't try to add their own metadata or reason about these files. + * The file should never be exposed to userspace. + */ + VFS_I(sc->tempip)->i_flags |= S_PRIVATE; + VFS_I(sc->tempip)->i_opflags &= ~IOP_XATTR; + + if (is_dir) { + error = xfs_dir_init(tp, sc->tempip, dp); + if (error) + goto out_trans_cancel; + } else if (S_ISLNK(VFS_I(sc->tempip)->i_mode)) { + /* + * Initialize the temporary symlink with a meaningless target + * that won't trip the verifiers. Repair must rewrite the + * target with meaningful content before swapping with the file + * being repaired. A single-byte target will not write a + * remote target block, so the owner is irrelevant. + */ + error = xfs_symlink_write_target(tp, sc->tempip, + sc->tempip->i_ino, ".", 1, 0, 0); + if (error) + goto out_trans_cancel; + } + + /* + * Attach the dquot(s) to the inodes and modify them incore. 
+ * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. + */ + xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp); + + /* + * Put our temp file on the unlinked list so it's purged automatically. + * All file-based metadata being reconstructed using this file must be + * atomically exchanged with the original file because the contents + * here will be purged when the inode is dropped or log recovery cleans + * out the unlinked list. + */ + error = xfs_iunlink(tp, sc->tempip); + if (error) + goto out_trans_cancel; + + error = xfs_trans_commit(tp); + if (error) + goto out_release_inode; + + trace_xrep_tempfile_create(sc); + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + /* Finish setting up the incore / vfs context. */ + xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL); + xfs_setup_iops(sc->tempip); + xfs_finish_inode_setup(sc->tempip); + + sc->temp_ilock_flags = 0; + return error; + +out_trans_cancel: + xfs_trans_cancel(tp); +out_release_inode: + /* + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. + */ + if (sc->tempip) { + xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL); + xfs_finish_inode_setup(sc->tempip); + xchk_irele(sc, sc->tempip); + } +out_release_dquots: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + return error; +} + +/* Take IOLOCK_EXCL on the temporary file, maybe. */ +bool +xrep_tempfile_iolock_nowait( + struct xfs_scrub *sc) +{ + if (xfs_ilock_nowait(sc->tempip, XFS_IOLOCK_EXCL)) { + sc->temp_ilock_flags |= XFS_IOLOCK_EXCL; + return true; + } + + return false; +} + +/* + * Take the temporary file's IOLOCK while holding a different inode's IOLOCK. + * In theory nobody else should hold the tempfile's IOLOCK, but we use trylock + * to avoid deadlocks and lockdep complaints. 
+ */ +int +xrep_tempfile_iolock_polled( + struct xfs_scrub *sc) +{ + int error = 0; + + while (!xrep_tempfile_iolock_nowait(sc)) { + if (xchk_should_terminate(sc, &error)) + return error; + delay(1); + } + + return 0; +} + +/* Release IOLOCK_EXCL on the temporary file. */ +void +xrep_tempfile_iounlock( + struct xfs_scrub *sc) +{ + xfs_iunlock(sc->tempip, XFS_IOLOCK_EXCL); + sc->temp_ilock_flags &= ~XFS_IOLOCK_EXCL; +} + +/* Prepare the temporary file for metadata updates by grabbing ILOCK_EXCL. */ +void +xrep_tempfile_ilock( + struct xfs_scrub *sc) +{ + sc->temp_ilock_flags |= XFS_ILOCK_EXCL; + xfs_ilock(sc->tempip, XFS_ILOCK_EXCL); +} + +/* Try to grab ILOCK_EXCL on the temporary file. */ +bool +xrep_tempfile_ilock_nowait( + struct xfs_scrub *sc) +{ + if (xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL)) { + sc->temp_ilock_flags |= XFS_ILOCK_EXCL; + return true; + } + + return false; +} + +/* Unlock ILOCK_EXCL on the temporary file after an update. */ +void +xrep_tempfile_iunlock( + struct xfs_scrub *sc) +{ + xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL); + sc->temp_ilock_flags &= ~XFS_ILOCK_EXCL; +} + +/* + * Begin the process of making changes to both the file being scrubbed and + * the temporary file by taking ILOCK_EXCL on both. + */ +void +xrep_tempfile_ilock_both( + struct xfs_scrub *sc) +{ + xfs_lock_two_inodes(sc->ip, XFS_ILOCK_EXCL, sc->tempip, XFS_ILOCK_EXCL); + sc->ilock_flags |= XFS_ILOCK_EXCL; + sc->temp_ilock_flags |= XFS_ILOCK_EXCL; +} + +/* Unlock ILOCK_EXCL on both files. */ +void +xrep_tempfile_iunlock_both( + struct xfs_scrub *sc) +{ + xrep_tempfile_iunlock(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL); +} + +/* Release the temporary file. 
*/ +void +xrep_tempfile_rele( + struct xfs_scrub *sc) +{ + if (!sc->tempip) + return; + + if (sc->temp_ilock_flags) { + xfs_iunlock(sc->tempip, sc->temp_ilock_flags); + sc->temp_ilock_flags = 0; + } + + xchk_irele(sc, sc->tempip); + sc->tempip = NULL; +} + +/* + * Make sure that the given range of the data fork of the temporary file is + * mapped to written blocks. The caller must ensure that both inodes are + * joined to the transaction. + */ +int +xrep_tempfile_prealloc( + struct xfs_scrub *sc, + xfs_fileoff_t off, + xfs_filblks_t len) +{ + struct xfs_bmbt_irec map; + xfs_fileoff_t end = off + len; + int error; + + ASSERT(sc->tempip != NULL); + ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip)); + + for (; off < end; off = map.br_startoff + map.br_blockcount) { + int nmaps = 1; + + /* + * If we have a real extent mapping this block then we're + * in ok shape. + */ + error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps, + XFS_DATA_FORK); + if (error) + return error; + if (nmaps == 0) { + ASSERT(nmaps != 0); + return -EFSCORRUPTED; + } + + if (xfs_bmap_is_written_extent(&map)) + continue; + + /* + * If we find a delalloc reservation then something is very + * very wrong. Bail out. + */ + if (map.br_startblock == DELAYSTARTBLOCK) + return -EFSCORRUPTED; + + /* + * Make sure this block has a real zeroed extent allocated to + * it. + */ + nmaps = 1; + error = xfs_bmapi_write(sc->tp, sc->tempip, off, end - off, + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map, + &nmaps); + if (error) + return error; + if (nmaps != 1) + return -EFSCORRUPTED; + + trace_xrep_tempfile_prealloc(sc, XFS_DATA_FORK, &map); + + /* Commit new extent and all deferred work. */ + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + } + + return 0; +} + +/* + * Write data to each block of a file. The given range of the tempfile's data + * fork must already be populated with written extents. 
+ */ +int +xrep_tempfile_copyin( + struct xfs_scrub *sc, + xfs_fileoff_t off, + xfs_filblks_t len, + xrep_tempfile_copyin_fn prep_fn, + void *data) +{ + LIST_HEAD(buffers_list); + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + xfs_fileoff_t flush_mask; + xfs_fileoff_t end = off + len; + loff_t pos = XFS_FSB_TO_B(mp, off); + int error = 0; + + ASSERT(S_ISREG(VFS_I(sc->tempip)->i_mode)); + + /* Flush buffers to disk every 512K */ + flush_mask = XFS_B_TO_FSBT(mp, (1U << 19)) - 1; + + for (; off < end; off++, pos += mp->m_sb.sb_blocksize) { + struct xfs_bmbt_irec map; + int nmaps = 1; + + /* Read block mapping for this file block. */ + error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmaps, 0); + if (error) + goto out_err; + if (nmaps == 0 || !xfs_bmap_is_written_extent(&map)) { + error = -EFSCORRUPTED; + goto out_err; + } + + /* Get the metadata buffer for this offset in the file. */ + error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map.br_startblock), + mp->m_bsize, 0, &bp); + if (error) + goto out_err; + + trace_xrep_tempfile_copyin(sc, XFS_DATA_FORK, &map); + + /* Read in a block's worth of data from the xfile. */ + error = prep_fn(sc, bp, data); + if (error) { + xfs_trans_brelse(sc->tp, bp); + goto out_err; + } + + /* Queue buffer, and flush if we have too much dirty data. */ + xfs_buf_delwri_queue_here(bp, &buffers_list); + xfs_trans_brelse(sc->tp, bp); + + if (!(off & flush_mask)) { + error = xfs_buf_delwri_submit(&buffers_list); + if (error) + goto out_err; + } + } + + /* + * Write the new blocks to disk. If the ordered list isn't empty after + * that, then something went wrong and we have to fail. This should + * never happen, but we'll check anyway. 
+ */ + error = xfs_buf_delwri_submit(&buffers_list); + if (error) + goto out_err; + + if (!list_empty(&buffers_list)) { + ASSERT(list_empty(&buffers_list)); + error = -EIO; + goto out_err; + } + + return 0; + +out_err: + xfs_buf_delwri_cancel(&buffers_list); + return error; +} + +/* + * Set the temporary file's size. Caller must join the tempfile to the scrub + * transaction and is responsible for adjusting block mappings as needed. + */ +int +xrep_tempfile_set_isize( + struct xfs_scrub *sc, + unsigned long long isize) +{ + if (sc->tempip->i_disk_size == isize) + return 0; + + sc->tempip->i_disk_size = isize; + i_size_write(VFS_I(sc->tempip), isize); + return xrep_tempfile_roll_trans(sc); +} + +/* + * Roll a repair transaction involving the temporary file. Caller must join + * both the temporary file and the file being scrubbed to the transaction. + * This function returns with both inodes joined to a new scrub transaction, + * or the usual negative errno. + */ +int +xrep_tempfile_roll_trans( + struct xfs_scrub *sc) +{ + int error; + + xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE); + error = xrep_roll_trans(sc); + if (error) + return error; + + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + return 0; +} + +/* + * Fill out the mapping exchange request in preparation for atomically + * committing the contents of a metadata file that we've rebuilt in the temp + * file. + */ +STATIC int +xrep_tempexch_prep_request( + struct xfs_scrub *sc, + int whichfork, + struct xrep_tempexch *tx) +{ + struct xfs_exchmaps_req *req = &tx->req; + + memset(tx, 0, sizeof(struct xrep_tempexch)); + + /* COW forks don't exist on disk. */ + if (whichfork == XFS_COW_FORK) { + ASSERT(0); + return -EINVAL; + } + + /* Both files should have the relevant forks. 
*/ + if (!xfs_ifork_ptr(sc->ip, whichfork) || + !xfs_ifork_ptr(sc->tempip, whichfork)) { + ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL); + ASSERT(xfs_ifork_ptr(sc->tempip, whichfork) != NULL); + return -EINVAL; + } + + /* Exchange all mappings in both forks. */ + req->ip1 = sc->tempip; + req->ip2 = sc->ip; + req->startoff1 = 0; + req->startoff2 = 0; + switch (whichfork) { + case XFS_ATTR_FORK: + req->flags |= XFS_EXCHMAPS_ATTR_FORK; + break; + case XFS_DATA_FORK: + /* Always exchange sizes when exchanging data fork mappings. */ + req->flags |= XFS_EXCHMAPS_SET_SIZES; + break; + } + req->blockcount = XFS_MAX_FILEOFF; + + return 0; +} + +/* + * Fill out the mapping exchange resource estimation structures in preparation + * for exchanging the contents of a metadata file that we've rebuilt in the + * temp file. Caller must hold IOLOCK_EXCL but not ILOCK_EXCL on both files. + */ +STATIC int +xrep_tempexch_estimate( + struct xfs_scrub *sc, + struct xrep_tempexch *tx) +{ + struct xfs_exchmaps_req *req = &tx->req; + struct xfs_ifork *ifp; + struct xfs_ifork *tifp; + int whichfork = xfs_exchmaps_reqfork(req); + int state = 0; + + /* + * The exchmaps code only knows how to exchange file fork space + * mappings. Any fork data in local format must be promoted to a + * single block before the exchange can take place. + */ + ifp = xfs_ifork_ptr(sc->ip, whichfork); + if (ifp->if_format == XFS_DINODE_FMT_LOCAL) + state |= 1; + + tifp = xfs_ifork_ptr(sc->tempip, whichfork); + if (tifp->if_format == XFS_DINODE_FMT_LOCAL) + state |= 2; + + switch (state) { + case 0: + /* Both files have mapped extents; use the regular estimate. */ + return xfs_exchrange_estimate(req); + case 1: + /* + * The file being repaired is in local format, but the temp + * file has mapped extents. To perform the exchange, the file + * being repaired must have its shortform data converted to an + * ondisk block so that the forks will be in extents format. 
+ * We need one resblk for the conversion; the number of + * exchanges is (worst case) the temporary file's extent count + * plus the block we converted. + */ + req->ip1_bcount = sc->tempip->i_nblocks; + req->ip2_bcount = 1; + req->nr_exchanges = 1 + tifp->if_nextents; + req->resblks = 1; + break; + case 2: + /* + * The temporary file is in local format, but the file being + * repaired has mapped extents. To perform the exchange, the + * temp file must have its shortform data converted to an + * ondisk block, and the fork changed to extents format. We + * need one resblk for the conversion; the number of exchanges + * is (worst case) the extent count of the file being repaired + * plus the block we converted. + */ + req->ip1_bcount = 1; + req->ip2_bcount = sc->ip->i_nblocks; + req->nr_exchanges = 1 + ifp->if_nextents; + req->resblks = 1; + break; + case 3: + /* + * Both forks are in local format. To perform the exchange, + * both files must have their shortform data converted to + * fsblocks, and both forks must be converted to extents + * format. We need two resblks for the two conversions, and + * the number of exchanges is 1 since there's only one block at + * fileoff 0. Presumably, the caller could not exchange the + * two inode fork areas directly. + */ + req->ip1_bcount = 1; + req->ip2_bcount = 1; + req->nr_exchanges = 1; + req->resblks = 2; + break; + } + + return xfs_exchmaps_estimate_overhead(req); +} + +/* + * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip + * this if quota enforcement is disabled or if both inodes' dquots are the + * same. The qretry structure must be initialized to zeroes before the first + * call to this function. 
+ */ +STATIC int +xrep_tempexch_reserve_quota( + struct xfs_scrub *sc, + const struct xrep_tempexch *tx) +{ + struct xfs_trans *tp = sc->tp; + const struct xfs_exchmaps_req *req = &tx->req; + int64_t ddelta, rdelta; + int error; + + /* + * Don't bother with a quota reservation if we're not enforcing them + * or the two inodes have the same dquots. + */ + if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 || + (req->ip1->i_udquot == req->ip2->i_udquot && + req->ip1->i_gdquot == req->ip2->i_gdquot && + req->ip1->i_pdquot == req->ip2->i_pdquot)) + return 0; + + /* + * Quota reservation for each file comes from two sources. First, we + * need to account for any net gain in mapped blocks during the + * exchange. Second, we need reservation for the gross gain in mapped + * blocks so that we don't trip over any quota block reservation + * assertions. We must reserve the gross gain because the quota code + * subtracts from bcount the number of blocks that we unmap; it does + * not add that quantity back to the quota block reservation. + */ + ddelta = max_t(int64_t, 0, req->ip2_bcount - req->ip1_bcount); + rdelta = max_t(int64_t, 0, req->ip2_rtbcount - req->ip1_rtbcount); + error = xfs_trans_reserve_quota_nblks(tp, req->ip1, + ddelta + req->ip1_bcount, rdelta + req->ip1_rtbcount, + true); + if (error) + return error; + + ddelta = max_t(int64_t, 0, req->ip1_bcount - req->ip2_bcount); + rdelta = max_t(int64_t, 0, req->ip1_rtbcount - req->ip2_rtbcount); + return xfs_trans_reserve_quota_nblks(tp, req->ip2, + ddelta + req->ip2_bcount, rdelta + req->ip2_rtbcount, + true); +} + +/* + * Prepare an existing transaction for an atomic file contents exchange. + * + * This function fills out the mapping exchange request and resource estimation + * structures in preparation for exchanging the contents of a metadata file + * that has been rebuilt in the temp file. Next, it reserves space and quota + * for the transaction. 
+ * + * The caller must hold ILOCK_EXCL of the scrub target file and the temporary + * file. The caller must join both inodes to the transaction with no unlock + * flags, and is responsible for dropping both ILOCKs when appropriate. Only + * use this when those ILOCKs cannot be dropped. + */ +int +xrep_tempexch_trans_reserve( + struct xfs_scrub *sc, + int whichfork, + struct xrep_tempexch *tx) +{ + int error; + + ASSERT(sc->tp != NULL); + xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL); + xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL); + + error = xrep_tempexch_prep_request(sc, whichfork, tx); + if (error) + return error; + + error = xfs_exchmaps_estimate(&tx->req); + if (error) + return error; + + error = xfs_trans_reserve_more(sc->tp, tx->req.resblks, 0); + if (error) + return error; + + return xrep_tempexch_reserve_quota(sc, tx); +} + +/* + * Create a new transaction for a file contents exchange. + * + * This function fills out the mapping exchange request and resource + * estimation structures in preparation for exchanging the contents of a + * metadata file that has been rebuilt in the temp file. Next, it reserves + * space, takes ILOCK_EXCL of both inodes, joins them to the transaction and + * reserves quota for the transaction. + * + * The caller is responsible for dropping both ILOCKs when appropriate. 
+ */ +int +xrep_tempexch_trans_alloc( + struct xfs_scrub *sc, + int whichfork, + struct xrep_tempexch *tx) +{ + unsigned int flags = 0; + int error; + + ASSERT(sc->tp == NULL); + ASSERT(xfs_has_exchange_range(sc->mp)); + + error = xrep_tempexch_prep_request(sc, whichfork, tx); + if (error) + return error; + + error = xrep_tempexch_estimate(sc, tx); + if (error) + return error; + + if (xfs_has_lazysbcount(sc->mp)) + flags |= XFS_TRANS_RES_FDBLKS; + + error = xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate, + tx->req.resblks, 0, flags, &sc->tp); + if (error) + return error; + + sc->temp_ilock_flags |= XFS_ILOCK_EXCL; + sc->ilock_flags |= XFS_ILOCK_EXCL; + xfs_exchrange_ilock(sc->tp, sc->ip, sc->tempip); + + return xrep_tempexch_reserve_quota(sc, tx); +} + +/* + * Exchange file mappings (and hence file contents) between the file being + * repaired and the temporary file. Returns with both inodes locked and joined + * to a clean scrub transaction. + */ +int +xrep_tempexch_contents( + struct xfs_scrub *sc, + struct xrep_tempexch *tx) +{ + int error; + + ASSERT(xfs_has_exchange_range(sc->mp)); + + xfs_exchange_mappings(sc->tp, &tx->req); + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + + /* + * If we exchanged the ondisk sizes of two metadata files, we must + * exchange the incore sizes as well. + */ + if (tx->req.flags & XFS_EXCHMAPS_SET_SIZES) { + loff_t temp; + + temp = i_size_read(VFS_I(sc->ip)); + i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip))); + i_size_write(VFS_I(sc->tempip), temp); + } + + return 0; +} + +/* + * Write local format data from one of the temporary file's forks into the same + * fork of file being repaired, and exchange the file sizes, if appropriate. + * Caller must ensure that the file being repaired has enough fork space to + * hold all the bytes. 
+ */ +void +xrep_tempfile_copyout_local( + struct xfs_scrub *sc, + int whichfork) +{ + struct xfs_ifork *temp_ifp; + struct xfs_ifork *ifp; + unsigned int ilog_flags = XFS_ILOG_CORE; + + temp_ifp = xfs_ifork_ptr(sc->tempip, whichfork); + ifp = xfs_ifork_ptr(sc->ip, whichfork); + + ASSERT(temp_ifp != NULL); + ASSERT(ifp != NULL); + ASSERT(temp_ifp->if_format == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); + + switch (whichfork) { + case XFS_DATA_FORK: + ASSERT(sc->tempip->i_disk_size <= + xfs_inode_data_fork_size(sc->ip)); + break; + case XFS_ATTR_FORK: + ASSERT(sc->tempip->i_forkoff >= sc->ip->i_forkoff); + break; + default: + ASSERT(0); + return; + } + + /* Recreate @sc->ip's incore fork (ifp) with data from temp_ifp. */ + xfs_idestroy_fork(ifp); + xfs_init_local_fork(sc->ip, whichfork, temp_ifp->if_data, + temp_ifp->if_bytes); + + if (whichfork == XFS_DATA_FORK) { + i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip))); + sc->ip->i_disk_size = sc->tempip->i_disk_size; + } + + ilog_flags |= xfs_ilog_fdata(whichfork); + xfs_trans_log_inode(sc->tp, sc->ip, ilog_flags); +} + +/* Decide if a given XFS inode is a temporary file for a repair. */ +bool +xrep_is_tempfile( + const struct xfs_inode *ip) +{ + const struct inode *inode = &ip->i_vnode; + + if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR)) + return true; + + return false; +} diff --git a/fs/xfs/scrub/tempfile.h b/fs/xfs/scrub/tempfile.h new file mode 100644 index 000000000000..e51399f595fe --- /dev/null +++ b/fs/xfs/scrub/tempfile.h @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_TEMPFILE_H__ +#define __XFS_SCRUB_TEMPFILE_H__ + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_tempfile_create(struct xfs_scrub *sc, uint16_t mode); +void xrep_tempfile_rele(struct xfs_scrub *sc); + +bool xrep_tempfile_iolock_nowait(struct xfs_scrub *sc); +int xrep_tempfile_iolock_polled(struct xfs_scrub *sc); +void xrep_tempfile_iounlock(struct xfs_scrub *sc); + +void xrep_tempfile_ilock(struct xfs_scrub *sc); +bool xrep_tempfile_ilock_nowait(struct xfs_scrub *sc); +void xrep_tempfile_iunlock(struct xfs_scrub *sc); +void xrep_tempfile_iunlock_both(struct xfs_scrub *sc); +void xrep_tempfile_ilock_both(struct xfs_scrub *sc); + +int xrep_tempfile_prealloc(struct xfs_scrub *sc, xfs_fileoff_t off, + xfs_filblks_t len); + +enum xfs_blft; + +typedef int (*xrep_tempfile_copyin_fn)(struct xfs_scrub *sc, + struct xfs_buf *bp, void *data); + +int xrep_tempfile_copyin(struct xfs_scrub *sc, xfs_fileoff_t off, + xfs_filblks_t len, xrep_tempfile_copyin_fn fn, void *data); + +int xrep_tempfile_set_isize(struct xfs_scrub *sc, unsigned long long isize); + +int xrep_tempfile_roll_trans(struct xfs_scrub *sc); +void xrep_tempfile_copyout_local(struct xfs_scrub *sc, int whichfork); +bool xrep_is_tempfile(const struct xfs_inode *ip); +#else +static inline void xrep_tempfile_iolock_both(struct xfs_scrub *sc) +{ + xchk_ilock(sc, XFS_IOLOCK_EXCL); +} +# define xrep_is_tempfile(ip) (false) +# define xrep_tempfile_rele(sc) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_TEMPFILE_H__ */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 3dd281d6d185..4470ad0533b8 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -19,13 +19,19 @@ #include "xfs_da_format.h" #include "xfs_dir2.h" #include "xfs_rmap.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/quota.h" #include "scrub/iscan.h" +#include "scrub/orphanage.h" #include "scrub/nlinks.h" 
#include "scrub/fscounters.h" +#include "scrub/bitmap.h" +#include "scrub/ino_bitmap.h" +#include "scrub/xfblob.h" +#include "scrub/dirtree.h" /* Figure out which block the btree cursor was pointing to. */ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 5b294be52c55..92ef4cdc486e 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -26,6 +26,10 @@ struct xchk_iscan; struct xchk_nlink; struct xchk_fscounters; struct xfs_rmap_update_params; +struct xfs_parent_rec; +enum xchk_dirpath_outcome; +struct xchk_dirtree; +struct xchk_dirtree_outcomes; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -64,6 +68,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIRTREE); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -93,7 +99,9 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY); { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }, \ { XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" }, \ { XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \ - { XFS_SCRUB_TYPE_HEALTHY, "healthy" } + { XFS_SCRUB_TYPE_HEALTHY, "healthy" }, \ + { XFS_SCRUB_TYPE_DIRTREE, "dirtree" }, \ + { XFS_SCRUB_TYPE_BARRIER, "barrier" } #define XFS_SCRUB_FLAG_STRINGS \ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ @@ -169,6 +177,8 @@ DEFINE_EVENT(xchk_class, name, \ DEFINE_SCRUB_EVENT(xchk_start); DEFINE_SCRUB_EVENT(xchk_done); DEFINE_SCRUB_EVENT(xchk_deadlock_retry); +DEFINE_SCRUB_EVENT(xchk_dirtree_start); +DEFINE_SCRUB_EVENT(xchk_dirtree_done); DEFINE_SCRUB_EVENT(xrep_attempt); DEFINE_SCRUB_EVENT(xrep_done); @@ -199,6 +209,81 @@ DEFINE_EVENT(xchk_fsgate_class, name, \ DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_enable); DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_disable); +DECLARE_EVENT_CLASS(xchk_vector_head_class, + TP_PROTO(struct 
xfs_inode *ip, struct xfs_scrub_vec_head *vhead), + TP_ARGS(ip, vhead), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, inum) + __field(unsigned int, gen) + __field(unsigned int, flags) + __field(unsigned short, rest_us) + __field(unsigned short, nr_vecs) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->agno = vhead->svh_agno; + __entry->inum = vhead->svh_ino; + __entry->gen = vhead->svh_gen; + __entry->flags = vhead->svh_flags; + __entry->rest_us = vhead->svh_rest_us; + __entry->nr_vecs = vhead->svh_nr; + ), + TP_printk("dev %d:%d ino 0x%llx agno 0x%x inum 0x%llx gen 0x%x flags 0x%x rest_us %u nr_vecs %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->agno, + __entry->inum, + __entry->gen, + __entry->flags, + __entry->rest_us, + __entry->nr_vecs) +) +#define DEFINE_SCRUBV_HEAD_EVENT(name) \ +DEFINE_EVENT(xchk_vector_head_class, name, \ + TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_vec_head *vhead), \ + TP_ARGS(ip, vhead)) + +DEFINE_SCRUBV_HEAD_EVENT(xchk_scrubv_start); + +DECLARE_EVENT_CLASS(xchk_vector_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead, + unsigned int vec_nr, struct xfs_scrub_vec *v), + TP_ARGS(mp, vhead, vec_nr, v), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, vec_nr) + __field(unsigned int, vec_type) + __field(unsigned int, vec_flags) + __field(int, vec_ret) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->vec_nr = vec_nr; + __entry->vec_type = v->sv_type; + __entry->vec_flags = v->sv_flags; + __entry->vec_ret = v->sv_ret; + ), + TP_printk("dev %d:%d vec[%u] type %s flags %s ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->vec_nr, + __print_symbolic(__entry->vec_type, XFS_SCRUB_TYPE_STRINGS), + __print_flags(__entry->vec_flags, "|", XFS_SCRUB_FLAG_STRINGS), + __entry->vec_ret) +) +#define 
DEFINE_SCRUBV_EVENT(name) \ +DEFINE_EVENT(xchk_vector_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead, \ + unsigned int vec_nr, struct xfs_scrub_vec *v), \ + TP_ARGS(mp, vhead, vec_nr, v)) + +DEFINE_SCRUBV_EVENT(xchk_scrubv_barrier_fail); +DEFINE_SCRUBV_EVENT(xchk_scrubv_item); +DEFINE_SCRUBV_EVENT(xchk_scrubv_outcome); + TRACE_EVENT(xchk_op_error, TP_PROTO(struct xfs_scrub *sc, xfs_agnumber_t agno, xfs_agblock_t bno, int error, void *ret_ip), @@ -364,6 +449,7 @@ DEFINE_EVENT(xchk_fblock_error_class, name, \ DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error); DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning); +DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_preen); #ifdef CONFIG_XFS_QUOTA DECLARE_EVENT_CLASS(xchk_dqiter_class, @@ -475,7 +561,7 @@ TRACE_EVENT(xchk_btree_op_error, __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); @@ -518,7 +604,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error, __entry->ino = sc->ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->ptr = cur->bc_levels[level].ptr; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); @@ -558,7 +644,7 @@ TRACE_EVENT(xchk_btree_error, xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); @@ -598,7 +684,7 @@ TRACE_EVENT(xchk_ifork_btree_error, __entry->ino = sc->ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; - __assign_str(name, 
cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); @@ -637,7 +723,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class, __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); __entry->level = level; @@ -947,6 +1033,7 @@ DEFINE_XFILE_EVENT(xfile_store); DEFINE_XFILE_EVENT(xfile_seek_data); DEFINE_XFILE_EVENT(xfile_get_folio); DEFINE_XFILE_EVENT(xfile_put_folio); +DEFINE_XFILE_EVENT(xfile_discard); TRACE_EVENT(xfarray_create, TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity), @@ -1300,7 +1387,7 @@ TRACE_EVENT(xchk_iscan_iget_batch, __entry->unavail) ); -TRACE_EVENT(xchk_iscan_iget_retry_wait, +DECLARE_EVENT_CLASS(xchk_iscan_retry_wait_class, TP_PROTO(struct xchk_iscan *iscan), TP_ARGS(iscan), TP_STRUCT__entry( @@ -1326,7 +1413,13 @@ TRACE_EVENT(xchk_iscan_iget_retry_wait, __entry->remaining, __entry->iget_timeout, __entry->retry_delay) -); +) +#define DEFINE_ISCAN_RETRY_WAIT_EVENT(name) \ +DEFINE_EVENT(xchk_iscan_retry_wait_class, name, \ + TP_PROTO(struct xchk_iscan *iscan), \ + TP_ARGS(iscan)) +DEFINE_ISCAN_RETRY_WAIT_EVENT(xchk_iscan_iget_retry_wait); +DEFINE_ISCAN_RETRY_WAIT_EVENT(xchk_iscan_agi_retry_wait); TRACE_EVENT(xchk_nlinks_collect_dirent, TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp, @@ -1354,6 +1447,33 @@ TRACE_EVENT(xchk_nlinks_collect_dirent, __get_str(name)) ); +TRACE_EVENT(xchk_nlinks_collect_pptr, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp, + const struct xfs_name *name, + const struct xfs_parent_rec *pptr), + TP_ARGS(mp, dp, name, pptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(xfs_ino_t, ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( 
+ __entry->dev = mp->m_super->s_dev; + __entry->dir = dp->i_ino; + __entry->ino = be64_to_cpu(pptr->p_ino); + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dir 0x%llx -> ino 0x%llx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->ino, + __entry->namelen, + __get_str(name)) +); + TRACE_EVENT(xchk_nlinks_collect_metafile, TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino), TP_ARGS(mp, ino), @@ -1502,6 +1622,300 @@ DEFINE_EVENT(xchk_nlinks_diff_class, name, \ TP_ARGS(mp, ip, live)) DEFINE_SCRUB_NLINKS_DIFF_EVENT(xchk_nlinks_compare_inode); +DECLARE_EVENT_CLASS(xchk_pptr_class, + TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, + xfs_ino_t far_ino), + TP_ARGS(ip, name, far_ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + __field(xfs_ino_t, far_ino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + __entry->far_ino = far_ino; + ), + TP_printk("dev %d:%d ino 0x%llx name '%.*s' far_ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __get_str(name), + __entry->far_ino) +) +#define DEFINE_XCHK_PPTR_EVENT(name) \ +DEFINE_EVENT(xchk_pptr_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, \ + xfs_ino_t far_ino), \ + TP_ARGS(ip, name, far_ino)) +DEFINE_XCHK_PPTR_EVENT(xchk_dir_defer); +DEFINE_XCHK_PPTR_EVENT(xchk_dir_slowpath); +DEFINE_XCHK_PPTR_EVENT(xchk_dir_ultraslowpath); +DEFINE_XCHK_PPTR_EVENT(xchk_parent_defer); +DEFINE_XCHK_PPTR_EVENT(xchk_parent_slowpath); +DEFINE_XCHK_PPTR_EVENT(xchk_parent_ultraslowpath); + +DECLARE_EVENT_CLASS(xchk_dirtree_class, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, + unsigned int path_nr, const struct xfs_name *name, + const struct 
xfs_parent_rec *pptr), + TP_ARGS(sc, ip, path_nr, name, pptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, path_nr) + __field(xfs_ino_t, child_ino) + __field(unsigned int, child_gen) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->path_nr = path_nr; + __entry->child_ino = ip->i_ino; + __entry->child_gen = VFS_I(ip)->i_generation; + __entry->parent_ino = be64_to_cpu(pptr->p_ino); + __entry->parent_gen = be32_to_cpu(pptr->p_gen); + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d path %u child_ino 0x%llx child_gen 0x%x parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->path_nr, + __entry->child_ino, + __entry->child_gen, + __entry->parent_ino, + __entry->parent_gen, + __entry->namelen, + __get_str(name)) +); +#define DEFINE_XCHK_DIRTREE_EVENT(name) \ +DEFINE_EVENT(xchk_dirtree_class, name, \ + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, \ + unsigned int path_nr, const struct xfs_name *name, \ + const struct xfs_parent_rec *pptr), \ + TP_ARGS(sc, ip, path_nr, name, pptr)) +DEFINE_XCHK_DIRTREE_EVENT(xchk_dirtree_create_path); +DEFINE_XCHK_DIRTREE_EVENT(xchk_dirpath_walk_upwards); + +DECLARE_EVENT_CLASS(xchk_dirpath_class, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, + unsigned int path_nr, unsigned int step_nr, + const struct xfs_name *name, + const struct xfs_parent_rec *pptr), + TP_ARGS(sc, ip, path_nr, step_nr, name, pptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, path_nr) + __field(unsigned int, step_nr) + __field(xfs_ino_t, child_ino) + __field(unsigned int, child_gen) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + 
TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->path_nr = path_nr; + __entry->step_nr = step_nr; + __entry->child_ino = ip->i_ino; + __entry->child_gen = VFS_I(ip)->i_generation; + __entry->parent_ino = be64_to_cpu(pptr->p_ino); + __entry->parent_gen = be32_to_cpu(pptr->p_gen); + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d path %u step %u child_ino 0x%llx child_gen 0x%x parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->path_nr, + __entry->step_nr, + __entry->child_ino, + __entry->child_gen, + __entry->parent_ino, + __entry->parent_gen, + __entry->namelen, + __get_str(name)) +); +#define DEFINE_XCHK_DIRPATH_EVENT(name) \ +DEFINE_EVENT(xchk_dirpath_class, name, \ + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, \ + unsigned int path_nr, unsigned int step_nr, \ + const struct xfs_name *name, \ + const struct xfs_parent_rec *pptr), \ + TP_ARGS(sc, ip, path_nr, step_nr, name, pptr)) +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_disappeared); +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_badgen); +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_nondir_parent); +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_unlinked_parent); +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_found_next_step); + +TRACE_DEFINE_ENUM(XCHK_DIRPATH_SCANNING); +TRACE_DEFINE_ENUM(XCHK_DIRPATH_DELETE); +TRACE_DEFINE_ENUM(XCHK_DIRPATH_CORRUPT); +TRACE_DEFINE_ENUM(XCHK_DIRPATH_LOOP); +TRACE_DEFINE_ENUM(XCHK_DIRPATH_STALE); +TRACE_DEFINE_ENUM(XCHK_DIRPATH_OK); +TRACE_DEFINE_ENUM(XREP_DIRPATH_DELETING); +TRACE_DEFINE_ENUM(XREP_DIRPATH_DELETED); +TRACE_DEFINE_ENUM(XREP_DIRPATH_ADOPTING); +TRACE_DEFINE_ENUM(XREP_DIRPATH_ADOPTED); + +#define XCHK_DIRPATH_OUTCOME_STRINGS \ + { XCHK_DIRPATH_SCANNING, "scanning" }, \ + { XCHK_DIRPATH_DELETE, "delete" }, \ + { XCHK_DIRPATH_CORRUPT, "corrupt" }, \ + { XCHK_DIRPATH_LOOP, "loop" }, \ + { XCHK_DIRPATH_STALE, "stale" }, \ + { XCHK_DIRPATH_OK, "ok" }, \ + { 
XREP_DIRPATH_DELETING, "deleting" }, \ + { XREP_DIRPATH_DELETED, "deleted" }, \ + { XREP_DIRPATH_ADOPTING, "adopting" }, \ + { XREP_DIRPATH_ADOPTED, "adopted" } + +DECLARE_EVENT_CLASS(xchk_dirpath_outcome_class, + TP_PROTO(struct xfs_scrub *sc, unsigned long long path_nr, + unsigned int nr_steps, \ + unsigned int outcome), + TP_ARGS(sc, path_nr, nr_steps, outcome), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long long, path_nr) + __field(unsigned int, nr_steps) + __field(unsigned int, outcome) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->path_nr = path_nr; + __entry->nr_steps = nr_steps; + __entry->outcome = outcome; + ), + TP_printk("dev %d:%d path %llu steps %u outcome %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->path_nr, + __entry->nr_steps, + __print_symbolic(__entry->outcome, XCHK_DIRPATH_OUTCOME_STRINGS)) +); +#define DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(name) \ +DEFINE_EVENT(xchk_dirpath_outcome_class, name, \ + TP_PROTO(struct xfs_scrub *sc, unsigned long long path_nr, \ + unsigned int nr_steps, \ + unsigned int outcome), \ + TP_ARGS(sc, path_nr, nr_steps, outcome)) +DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xchk_dirpath_set_outcome); +DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xchk_dirpath_evaluate_path); + +DECLARE_EVENT_CLASS(xchk_dirtree_evaluate_class, + TP_PROTO(const struct xchk_dirtree *dl, + const struct xchk_dirtree_outcomes *oc), + TP_ARGS(dl, oc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, rootino) + __field(unsigned int, nr_paths) + __field(unsigned int, bad) + __field(unsigned int, suspect) + __field(unsigned int, good) + __field(bool, needs_adoption) + ), + TP_fast_assign( + __entry->dev = dl->sc->mp->m_super->s_dev; + __entry->ino = dl->sc->ip->i_ino; + __entry->rootino = dl->root_ino; + __entry->nr_paths = dl->nr_paths; + __entry->bad = oc->bad; + __entry->suspect = oc->suspect; + __entry->good = oc->good; + __entry->needs_adoption = oc->needs_adoption ? 
1 : 0; + ), + TP_printk("dev %d:%d ino 0x%llx rootino 0x%llx nr_paths %u bad %u suspect %u good %u adopt? %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->rootino, + __entry->nr_paths, + __entry->bad, + __entry->suspect, + __entry->good, + __entry->needs_adoption) +); +#define DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(name) \ +DEFINE_EVENT(xchk_dirtree_evaluate_class, name, \ + TP_PROTO(const struct xchk_dirtree *dl, \ + const struct xchk_dirtree_outcomes *oc), \ + TP_ARGS(dl, oc)) +DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xchk_dirtree_evaluate); + +TRACE_EVENT(xchk_dirpath_changed, + TP_PROTO(struct xfs_scrub *sc, unsigned int path_nr, + unsigned int step_nr, const struct xfs_inode *dp, + const struct xfs_inode *ip, const struct xfs_name *xname), + TP_ARGS(sc, path_nr, step_nr, dp, ip, xname), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, path_nr) + __field(unsigned int, step_nr) + __field(xfs_ino_t, child_ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, xname->len) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->path_nr = path_nr; + __entry->step_nr = step_nr; + __entry->child_ino = ip->i_ino; + __entry->parent_ino = dp->i_ino; + __entry->namelen = xname->len; + memcpy(__get_str(name), xname->name, xname->len); + ), + TP_printk("dev %d:%d path %u step %u child_ino 0x%llx parent_ino 0x%llx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->path_nr, + __entry->step_nr, + __entry->child_ino, + __entry->parent_ino, + __entry->namelen, + __get_str(name)) +); + +TRACE_EVENT(xchk_dirtree_live_update, + TP_PROTO(struct xfs_scrub *sc, const struct xfs_inode *dp, + int action, const struct xfs_inode *ip, int delta, + const struct xfs_name *xname), + TP_ARGS(sc, dp, action, ip, delta, xname), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, parent_ino) + __field(int, action) + __field(xfs_ino_t, child_ino) + __field(int, delta) + 
__field(unsigned int, namelen) + __dynamic_array(char, name, xname->len) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->parent_ino = dp->i_ino; + __entry->action = action; + __entry->child_ino = ip->i_ino; + __entry->delta = delta; + __entry->namelen = xname->len; + memcpy(__get_str(name), xname->name, xname->len); + ), + TP_printk("dev %d:%d parent_ino 0x%llx child_ino 0x%llx nlink_delta %d name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->parent_ino, + __entry->child_ino, + __entry->delta, + __entry->namelen, + __get_str(name)) +); + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) @@ -1533,6 +1947,7 @@ DEFINE_EVENT(xrep_extent_class, name, \ DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent); DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent); DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); +DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval); DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); DECLARE_EVENT_CLASS(xrep_reap_find_class, @@ -1566,6 +1981,7 @@ DEFINE_EVENT(xrep_reap_find_class, name, \ bool crosslinked), \ TP_ARGS(pag, agbno, len, crosslinked)) DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select); +DEFINE_REPAIR_REAP_FIND_EVENT(xreap_bmapi_select); DECLARE_EVENT_CLASS(xrep_rmap_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, @@ -2273,6 +2689,891 @@ TRACE_EVENT(xrep_rmap_live_update, __entry->flags) ); +TRACE_EVENT(xrep_tempfile_create, + TP_PROTO(struct xfs_scrub *sc), + TP_ARGS(sc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, type) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, inum) + __field(unsigned int, gen) + __field(unsigned int, flags) + __field(xfs_ino_t, temp_inum) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->file ? 
XFS_I(file_inode(sc->file))->i_ino : 0; + __entry->type = sc->sm->sm_type; + __entry->agno = sc->sm->sm_agno; + __entry->inum = sc->sm->sm_ino; + __entry->gen = sc->sm->sm_gen; + __entry->flags = sc->sm->sm_flags; + __entry->temp_inum = sc->tempip->i_ino; + ), + TP_printk("dev %d:%d ino 0x%llx type %s inum 0x%llx gen 0x%x flags 0x%x temp_inum 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->inum, + __entry->gen, + __entry->flags, + __entry->temp_inum) +); + +DECLARE_EVENT_CLASS(xrep_tempfile_class, + TP_PROTO(struct xfs_scrub *sc, int whichfork, + struct xfs_bmbt_irec *irec), + TP_ARGS(sc, whichfork, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(xfs_fileoff_t, lblk) + __field(xfs_filblks_t, len) + __field(xfs_fsblock_t, pblk) + __field(int, state) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->tempip->i_ino; + __entry->whichfork = whichfork; + __entry->lblk = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->pblk = irec->br_startblock; + __entry->state = irec->br_state; + ), + TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->lblk, + __entry->len, + __entry->pblk, + __entry->state) +); +#define DEFINE_XREP_TEMPFILE_EVENT(name) \ +DEFINE_EVENT(xrep_tempfile_class, name, \ + TP_PROTO(struct xfs_scrub *sc, int whichfork, \ + struct xfs_bmbt_irec *irec), \ + TP_ARGS(sc, whichfork, irec)) +DEFINE_XREP_TEMPFILE_EVENT(xrep_tempfile_prealloc); +DEFINE_XREP_TEMPFILE_EVENT(xrep_tempfile_copyin); + +TRACE_EVENT(xreap_ifork_extent, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork, + const struct xfs_bmbt_irec *irec), + TP_ARGS(sc, ip, whichfork, irec), + 
TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(xfs_fileoff_t, fileoff) + __field(xfs_filblks_t, len) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(int, state) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->whichfork = whichfork; + __entry->fileoff = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock); + __entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock); + __entry->state = irec->br_state; + ), + TP_printk("dev %d:%d ip 0x%llx whichfork %s agno 0x%x agbno 0x%x fileoff 0x%llx fsbcount 0x%llx state 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->agno, + __entry->agbno, + __entry->fileoff, + __entry->len, + __entry->state) +); + +TRACE_EVENT(xreap_bmapi_binval_scan, + TP_PROTO(struct xfs_scrub *sc, const struct xfs_bmbt_irec *irec, + xfs_extlen_t scan_blocks), + TP_ARGS(sc, irec, scan_blocks), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_filblks_t, len) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, scan_blocks) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->len = irec->br_blockcount; + __entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock); + __entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock); + __entry->scan_blocks = scan_blocks; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%llx scan_blocks 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->scan_blocks) +); + +TRACE_EVENT(xrep_xattr_recover_leafblock, + TP_PROTO(struct xfs_inode *ip, xfs_dablk_t dabno, uint16_t magic), + TP_ARGS(ip, dabno, magic), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_dablk_t, 
dabno) + __field(uint16_t, magic) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->dabno = dabno; + __entry->magic = magic; + ), + TP_printk("dev %d:%d ino 0x%llx dablk 0x%x magic 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->dabno, + __entry->magic) +); + +DECLARE_EVENT_CLASS(xrep_xattr_salvage_class, + TP_PROTO(struct xfs_inode *ip, unsigned int flags, char *name, + unsigned int namelen, unsigned int valuelen), + TP_ARGS(ip, flags, name, namelen, valuelen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, flags) + __field(unsigned int, namelen) + __dynamic_array(char, name, namelen) + __field(unsigned int, valuelen) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->flags = flags; + __entry->namelen = namelen; + memcpy(__get_str(name), name, namelen); + __entry->valuelen = valuelen; + ), + TP_printk("dev %d:%d ino 0x%llx flags %s name '%.*s' valuelen 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->flags, "|", XFS_ATTR_NAMESPACE_STR), + __entry->namelen, + __get_str(name), + __entry->valuelen) +); +#define DEFINE_XREP_XATTR_SALVAGE_EVENT(name) \ +DEFINE_EVENT(xrep_xattr_salvage_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned int flags, char *name, \ + unsigned int namelen, unsigned int valuelen), \ + TP_ARGS(ip, flags, name, namelen, valuelen)) +DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_xattr_salvage_rec); +DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_xattr_insert_rec); +DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_parent_stash_xattr); +DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_parent_insert_xattr); + +DECLARE_EVENT_CLASS(xrep_pptr_salvage_class, + TP_PROTO(struct xfs_inode *ip, unsigned int flags, const void *name, + unsigned int namelen, const void *value, unsigned int valuelen), + TP_ARGS(ip, flags, name, namelen, value, valuelen), + 
TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, namelen) + ), + TP_fast_assign( + const struct xfs_parent_rec *rec = value; + + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->parent_ino = be64_to_cpu(rec->p_ino); + __entry->parent_gen = be32_to_cpu(rec->p_gen); + __entry->namelen = namelen; + memcpy(__get_str(name), name, namelen); + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __entry->parent_gen, + __entry->namelen, + __get_str(name)) +) +#define DEFINE_XREP_PPTR_SALVAGE_EVENT(name) \ +DEFINE_EVENT(xrep_pptr_salvage_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned int flags, const void *name, \ + unsigned int namelen, const void *value, unsigned int valuelen), \ + TP_ARGS(ip, flags, name, namelen, value, valuelen)) +DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_salvage_pptr); +DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_insert_pptr); + +TRACE_EVENT(xrep_xattr_class, + TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip), + TP_ARGS(ip, arg_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, src_ino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->src_ino = arg_ip->i_ino; + ), + TP_printk("dev %d:%d ino 0x%llx src 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->src_ino) +) +#define DEFINE_XREP_XATTR_EVENT(name) \ +DEFINE_EVENT(xrep_xattr_class, name, \ + TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip), \ + TP_ARGS(ip, arg_ip)) +DEFINE_XREP_XATTR_EVENT(xrep_xattr_rebuild_tree); +DEFINE_XREP_XATTR_EVENT(xrep_xattr_reset_fork); +DEFINE_XREP_XATTR_EVENT(xrep_xattr_full_reset); + 
+DECLARE_EVENT_CLASS(xrep_xattr_pptr_scan_class, + TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, + const struct xfs_name *name), + TP_ARGS(ip, dp, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->parent_ino = dp->i_ino; + __entry->parent_gen = VFS_IC(dp)->i_generation; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __entry->parent_gen, + __entry->namelen, + __get_str(name)) +) +#define DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(name) \ +DEFINE_EVENT(xrep_xattr_pptr_scan_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, \ + const struct xfs_name *name), \ + TP_ARGS(ip, dp, name)) +DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(xrep_xattr_stash_parentadd); +DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(xrep_xattr_stash_parentremove); + +TRACE_EVENT(xrep_dir_recover_dirblock, + TP_PROTO(struct xfs_inode *dp, xfs_dablk_t dabno, uint32_t magic, + uint32_t magic_guess), + TP_ARGS(dp, dabno, magic, magic_guess), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir_ino) + __field(xfs_dablk_t, dabno) + __field(uint32_t, magic) + __field(uint32_t, magic_guess) + ), + TP_fast_assign( + __entry->dev = dp->i_mount->m_super->s_dev; + __entry->dir_ino = dp->i_ino; + __entry->dabno = dabno; + __entry->magic = magic; + __entry->magic_guess = magic_guess; + ), + TP_printk("dev %d:%d dir 0x%llx dablk 0x%x magic 0x%x magic_guess 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir_ino, + __entry->dabno, + __entry->magic, + __entry->magic_guess) +); + 
+DECLARE_EVENT_CLASS(xrep_dir_class, + TP_PROTO(struct xfs_inode *dp, xfs_ino_t parent_ino), + TP_ARGS(dp, parent_ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir_ino) + __field(xfs_ino_t, parent_ino) + ), + TP_fast_assign( + __entry->dev = dp->i_mount->m_super->s_dev; + __entry->dir_ino = dp->i_ino; + __entry->parent_ino = parent_ino; + ), + TP_printk("dev %d:%d dir 0x%llx parent 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir_ino, + __entry->parent_ino) +) +#define DEFINE_XREP_DIR_EVENT(name) \ +DEFINE_EVENT(xrep_dir_class, name, \ + TP_PROTO(struct xfs_inode *dp, xfs_ino_t parent_ino), \ + TP_ARGS(dp, parent_ino)) +DEFINE_XREP_DIR_EVENT(xrep_dir_rebuild_tree); +DEFINE_XREP_DIR_EVENT(xrep_dir_reset_fork); +DEFINE_XREP_DIR_EVENT(xrep_parent_reset_dotdot); + +DECLARE_EVENT_CLASS(xrep_dirent_class, + TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name, + xfs_ino_t ino), + TP_ARGS(dp, name, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir_ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + __field(xfs_ino_t, ino) + __field(uint8_t, ftype) + ), + TP_fast_assign( + __entry->dev = dp->i_mount->m_super->s_dev; + __entry->dir_ino = dp->i_ino; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + __entry->ino = ino; + __entry->ftype = name->type; + ), + TP_printk("dev %d:%d dir 0x%llx ftype %s name '%.*s' ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir_ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR), + __entry->namelen, + __get_str(name), + __entry->ino) +) +#define DEFINE_XREP_DIRENT_EVENT(name) \ +DEFINE_EVENT(xrep_dirent_class, name, \ + TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name, \ + xfs_ino_t ino), \ + TP_ARGS(dp, name, ino)) +DEFINE_XREP_DIRENT_EVENT(xrep_dir_salvage_entry); +DEFINE_XREP_DIRENT_EVENT(xrep_dir_stash_createname); +DEFINE_XREP_DIRENT_EVENT(xrep_dir_replay_createname); 
+DEFINE_XREP_DIRENT_EVENT(xrep_adoption_reparent); +DEFINE_XREP_DIRENT_EVENT(xrep_dir_stash_removename); +DEFINE_XREP_DIRENT_EVENT(xrep_dir_replay_removename); + +DECLARE_EVENT_CLASS(xrep_adoption_class, + TP_PROTO(struct xfs_inode *dp, struct xfs_inode *ip, bool moved), + TP_ARGS(dp, ip, moved), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir_ino) + __field(xfs_ino_t, child_ino) + __field(bool, moved) + ), + TP_fast_assign( + __entry->dev = dp->i_mount->m_super->s_dev; + __entry->dir_ino = dp->i_ino; + __entry->child_ino = ip->i_ino; + __entry->moved = moved; + ), + TP_printk("dev %d:%d dir 0x%llx child 0x%llx moved? %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir_ino, + __entry->child_ino, + __entry->moved) +); +#define DEFINE_XREP_ADOPTION_EVENT(name) \ +DEFINE_EVENT(xrep_adoption_class, name, \ + TP_PROTO(struct xfs_inode *dp, struct xfs_inode *ip, bool moved), \ + TP_ARGS(dp, ip, moved)) +DEFINE_XREP_ADOPTION_EVENT(xrep_adoption_trans_roll); + +DECLARE_EVENT_CLASS(xrep_parent_salvage_class, + TP_PROTO(struct xfs_inode *dp, xfs_ino_t ino), + TP_ARGS(dp, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir_ino) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = dp->i_mount->m_super->s_dev; + __entry->dir_ino = dp->i_ino; + __entry->ino = ino; + ), + TP_printk("dev %d:%d dir 0x%llx parent 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir_ino, + __entry->ino) +) +#define DEFINE_XREP_PARENT_SALVAGE_EVENT(name) \ +DEFINE_EVENT(xrep_parent_salvage_class, name, \ + TP_PROTO(struct xfs_inode *dp, xfs_ino_t ino), \ + TP_ARGS(dp, ino)) +DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_dir_salvaged_parent); +DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_findparent_dirent); +DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_findparent_from_dcache); + +DECLARE_EVENT_CLASS(xrep_pptr_class, + TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, + const struct xfs_parent_rec *pptr), + TP_ARGS(ip, name, pptr), + 
TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->parent_ino = be64_to_cpu(pptr->p_ino); + __entry->parent_gen = be32_to_cpu(pptr->p_gen); + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __entry->parent_gen, + __entry->namelen, + __get_str(name)) +) +#define DEFINE_XREP_PPTR_EVENT(name) \ +DEFINE_EVENT(xrep_pptr_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, \ + const struct xfs_parent_rec *pptr), \ + TP_ARGS(ip, name, pptr)) +DEFINE_XREP_PPTR_EVENT(xrep_xattr_replay_parentadd); +DEFINE_XREP_PPTR_EVENT(xrep_xattr_replay_parentremove); +DEFINE_XREP_PPTR_EVENT(xrep_parent_replay_parentadd); +DEFINE_XREP_PPTR_EVENT(xrep_parent_replay_parentremove); + +DECLARE_EVENT_CLASS(xrep_pptr_scan_class, + TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, + const struct xfs_name *name), + TP_ARGS(ip, dp, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->parent_ino = dp->i_ino; + __entry->parent_gen = VFS_IC(dp)->i_generation; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __entry->parent_gen, + 
__entry->namelen, + __get_str(name)) +) +#define DEFINE_XREP_PPTR_SCAN_EVENT(name) \ +DEFINE_EVENT(xrep_pptr_scan_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, \ + const struct xfs_name *name), \ + TP_ARGS(ip, dp, name)) +DEFINE_XREP_PPTR_SCAN_EVENT(xrep_parent_stash_parentadd); +DEFINE_XREP_PPTR_SCAN_EVENT(xrep_parent_stash_parentremove); + +TRACE_EVENT(xrep_nlinks_set_record, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, + const struct xchk_nlink *obs), + TP_ARGS(mp, ino, obs), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + __entry->parents = obs->parents; + __entry->backrefs = obs->backrefs; + __entry->children = obs->children; + ), + TP_printk("dev %d:%d ino 0x%llx parents %u backrefs %u children %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parents, + __entry->backrefs, + __entry->children) +); + +DECLARE_EVENT_CLASS(xrep_dentry_class, + TP_PROTO(struct xfs_mount *mp, const struct dentry *dentry), + TP_ARGS(mp, dentry), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, flags) + __field(unsigned long, ino) + __field(bool, positive) + __field(unsigned long, parent_ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, dentry->d_name.len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->flags = dentry->d_flags; + __entry->positive = d_is_positive(dentry); + if (dentry->d_parent && d_inode(dentry->d_parent)) + __entry->parent_ino = d_inode(dentry->d_parent)->i_ino; + else + __entry->parent_ino = -1UL; + __entry->ino = d_inode(dentry) ? d_inode(dentry)->i_ino : 0; + __entry->namelen = dentry->d_name.len; + memcpy(__get_str(name), dentry->d_name.name, dentry->d_name.len); + ), + TP_printk("dev %d:%d flags 0x%x positive? 
%d parent_ino 0x%lx ino 0x%lx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->flags, + __entry->positive, + __entry->parent_ino, + __entry->ino, + __entry->namelen, + __get_str(name)) +); +#define DEFINE_REPAIR_DENTRY_EVENT(name) \ +DEFINE_EVENT(xrep_dentry_class, name, \ + TP_PROTO(struct xfs_mount *mp, const struct dentry *dentry), \ + TP_ARGS(mp, dentry)) +DEFINE_REPAIR_DENTRY_EVENT(xrep_adoption_check_child); +DEFINE_REPAIR_DENTRY_EVENT(xrep_adoption_invalidate_child); +DEFINE_REPAIR_DENTRY_EVENT(xrep_dirtree_delete_child); + +TRACE_EVENT(xrep_symlink_salvage_target, + TP_PROTO(struct xfs_inode *ip, char *target, unsigned int targetlen), + TP_ARGS(ip, target, targetlen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, targetlen) + __dynamic_array(char, target, targetlen + 1) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->targetlen = targetlen; + memcpy(__get_str(target), target, targetlen); + __get_str(target)[targetlen] = 0; + ), + TP_printk("dev %d:%d ip 0x%llx target '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->targetlen, + __get_str(target)) +); + +DECLARE_EVENT_CLASS(xrep_symlink_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + ), + TP_printk("dev %d:%d ip 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +); + +#define DEFINE_XREP_SYMLINK_EVENT(name) \ +DEFINE_EVENT(xrep_symlink_class, name, \ + TP_PROTO(struct xfs_inode *ip), \ + TP_ARGS(ip)) +DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_rebuild); +DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_reset_fork); + +TRACE_EVENT(xrep_iunlink_visit, + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t bucket_agino, struct xfs_inode *ip), + TP_ARGS(pag, bucket, 
bucket_agino, ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(unsigned int, bucket) + __field(xfs_agino_t, bucket_agino) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->agino = XFS_INO_TO_AGINO(pag->pag_mount, ip->i_ino); + __entry->bucket = bucket; + __entry->bucket_agino = bucket_agino; + __entry->prev_agino = ip->i_prev_unlinked; + __entry->next_agino = ip->i_next_unlinked; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x bucket_agino 0x%x prev_agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->agino, + __entry->bucket_agino, + __entry->prev_agino, + __entry->next_agino) +); + +TRACE_EVENT(xrep_iunlink_reload_next, + TP_PROTO(struct xfs_inode *ip, xfs_agino_t prev_agino), + TP_ARGS(ip, prev_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, old_prev_agino) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, next_agino) + __field(unsigned int, nlink) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->old_prev_agino = ip->i_prev_unlinked; + __entry->prev_agino = prev_agino; + __entry->next_agino = ip->i_next_unlinked; + __entry->nlink = VFS_I(ip)->i_nlink; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x nlink %u old_prev_agino %u prev_agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS, + __entry->agino, + __entry->nlink, + __entry->old_prev_agino, + __entry->prev_agino, + __entry->next_agino) +); + +TRACE_EVENT(xrep_iunlink_reload_ondisk, + 
TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(unsigned int, nlink) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->nlink = VFS_I(ip)->i_nlink; + __entry->next_agino = ip->i_next_unlinked; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x nlink %u next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS, + __entry->agino, + __entry->nlink, + __entry->next_agino) +); + +TRACE_EVENT(xrep_iunlink_walk_ondisk_bucket, + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t prev_agino, xfs_agino_t next_agino), + TP_ARGS(pag, bucket, prev_agino, next_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->bucket = bucket; + __entry->prev_agino = prev_agino; + __entry->next_agino = next_agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u prev_agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->prev_agino, + __entry->next_agino) +); + +DECLARE_EVENT_CLASS(xrep_iunlink_resolve_class, + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t prev_agino, xfs_agino_t next_agino), + TP_ARGS(pag, bucket, prev_agino, next_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + 
__entry->agno = pag->pag_agno; + __entry->bucket = bucket; + __entry->prev_agino = prev_agino; + __entry->next_agino = next_agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u prev_agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->prev_agino, + __entry->next_agino) +); +#define DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(name) \ +DEFINE_EVENT(xrep_iunlink_resolve_class, name, \ + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, \ + xfs_agino_t prev_agino, xfs_agino_t next_agino), \ + TP_ARGS(pag, bucket, prev_agino, next_agino)) +DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_uncached); +DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_wronglist); +DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_nolist); +DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_ok); + +TRACE_EVENT(xrep_iunlink_relink_next, + TP_PROTO(struct xfs_inode *ip, xfs_agino_t next_agino), + TP_ARGS(ip, next_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, next_agino) + __field(xfs_agino_t, new_next_agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->next_agino = ip->i_next_unlinked; + __entry->new_next_agino = next_agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x next_agino 0x%x -> 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS, + __entry->agino, + __entry->next_agino, + __entry->new_next_agino) +); + +TRACE_EVENT(xrep_iunlink_relink_prev, + TP_PROTO(struct xfs_inode *ip, xfs_agino_t prev_agino), + TP_ARGS(ip, prev_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, prev_agino) + 
__field(xfs_agino_t, new_prev_agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->prev_agino = ip->i_prev_unlinked; + __entry->new_prev_agino = prev_agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x prev_agino 0x%x -> 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS, + __entry->agino, + __entry->prev_agino, + __entry->new_prev_agino) +); + +TRACE_EVENT(xrep_iunlink_add_to_bucket, + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t agino, xfs_agino_t curr_head), + TP_ARGS(pag, bucket, agino, curr_head), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->bucket = bucket; + __entry->agino = agino; + __entry->next_agino = curr_head; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->agino, + __entry->next_agino) +); + +TRACE_EVENT(xrep_iunlink_commit_bucket, + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t old_agino, xfs_agino_t agino), + TP_ARGS(pag, bucket, old_agino, agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, old_agino) + __field(xfs_agino_t, agino) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->bucket = bucket; + __entry->old_agino = old_agino; + __entry->agino = agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x -> 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + 
__entry->agno, + __entry->bucket, + __entry->old_agino, + __entry->agino) +); + +DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xrep_dirpath_set_outcome); +DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_delete_path); +DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_create_adoption); +DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xrep_dirtree_decided_fate); + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index 17c982a4821d..9185ae7088d4 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -7,9 +7,9 @@ #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" -#include "scrub/scrub.h" #include "scrub/trace.h" /* @@ -486,6 +486,9 @@ xfarray_sortinfo_alloc( xfarray_sortinfo_lo(si)[0] = 0; xfarray_sortinfo_hi(si)[0] = array->nr - 1; + si->relax = INIT_XCHK_RELAX; + if (flags & XFARRAY_SORT_KILLABLE) + si->relax.interruptible = false; trace_xfarray_sort(si, nr_bytes); *infop = si; @@ -503,10 +506,7 @@ xfarray_sort_terminated( * few seconds so that we don't run afoul of the soft lockup watchdog * or RCU stall detector. */ - cond_resched(); - - if ((si->flags & XFARRAY_SORT_KILLABLE) && - fatal_signal_pending(current)) { + if (xchk_maybe_relax(&si->relax)) { if (*error == 0) *error = -EINTR; return true; @@ -1051,3 +1051,20 @@ out_free: kvfree(si); return error; } + +/* How many bytes is this array consuming? */ +unsigned long long +xfarray_bytes( + struct xfarray *array) +{ + return xfile_bytes(array->xfile); +} + +/* Empty the entire array. 
*/ +void +xfarray_truncate( + struct xfarray *array) +{ + xfile_discard(array->xfile, 0, MAX_LFS_FILESIZE); + array->nr = 0; +} diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h index acb2f94c56c1..5eeeeed13ae2 100644 --- a/fs/xfs/scrub/xfarray.h +++ b/fs/xfs/scrub/xfarray.h @@ -8,6 +8,7 @@ /* xfile array index type, along with cursor initialization */ typedef uint64_t xfarray_idx_t; +#define XFARRAY_NULLIDX ((__force xfarray_idx_t)-1ULL) #define XFARRAY_CURSOR_INIT ((__force xfarray_idx_t)0) /* Iterate each index of an xfile array. */ @@ -44,6 +45,8 @@ int xfarray_unset(struct xfarray *array, xfarray_idx_t idx); int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr); int xfarray_store_anywhere(struct xfarray *array, const void *ptr); bool xfarray_element_is_null(struct xfarray *array, const void *ptr); +void xfarray_truncate(struct xfarray *array); +unsigned long long xfarray_bytes(struct xfarray *array); /* * Load an array element, but zero the buffer if there's no data because we @@ -124,6 +127,9 @@ struct xfarray_sortinfo { /* XFARRAY_SORT_* flags; see below. */ unsigned int flags; + /* next time we want to cond_resched() */ + struct xchk_relax relax; + /* Cache a folio here for faster scanning for pivots */ struct folio *folio; diff --git a/fs/xfs/scrub/xfblob.c b/fs/xfs/scrub/xfblob.c new file mode 100644 index 000000000000..6ef2a9637f16 --- /dev/null +++ b/fs/xfs/scrub/xfblob.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "scrub/scrub.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" + +/* + * XFS Blob Storage + * ================ + * Stores and retrieves blobs using an xfile. 
Objects are appended to the file + * and the offset is returned as a magic cookie for retrieval. + */ + +#define XB_KEY_MAGIC 0xABAADDAD +struct xb_key { + uint32_t xb_magic; /* XB_KEY_MAGIC */ + uint32_t xb_size; /* size of the blob, in bytes */ + loff_t xb_offset; /* byte offset of this key */ + /* blob comes after here */ +} __packed; + +/* Initialize a blob storage object. */ +int +xfblob_create( + const char *description, + struct xfblob **blobp) +{ + struct xfblob *blob; + struct xfile *xfile; + int error; + + error = xfile_create(description, 0, &xfile); + if (error) + return error; + + blob = kmalloc(sizeof(struct xfblob), XCHK_GFP_FLAGS); + if (!blob) { + error = -ENOMEM; + goto out_xfile; + } + + blob->xfile = xfile; + blob->last_offset = PAGE_SIZE; + + *blobp = blob; + return 0; + +out_xfile: + xfile_destroy(xfile); + return error; +} + +/* Destroy a blob storage object. */ +void +xfblob_destroy( + struct xfblob *blob) +{ + xfile_destroy(blob->xfile); + kfree(blob); +} + +/* Retrieve a blob. */ +int +xfblob_load( + struct xfblob *blob, + xfblob_cookie cookie, + void *ptr, + uint32_t size) +{ + struct xb_key key; + int error; + + error = xfile_load(blob->xfile, &key, sizeof(key), cookie); + if (error) + return error; + + if (key.xb_magic != XB_KEY_MAGIC || key.xb_offset != cookie) { + ASSERT(0); + return -ENODATA; + } + if (size < key.xb_size) { + ASSERT(0); + return -EFBIG; + } + + return xfile_load(blob->xfile, ptr, key.xb_size, + cookie + sizeof(key)); +} + +/* Store a blob. 
*/ +int +xfblob_store( + struct xfblob *blob, + xfblob_cookie *cookie, + const void *ptr, + uint32_t size) +{ + struct xb_key key = { + .xb_offset = blob->last_offset, + .xb_magic = XB_KEY_MAGIC, + .xb_size = size, + }; + loff_t pos = blob->last_offset; + int error; + + error = xfile_store(blob->xfile, &key, sizeof(key), pos); + if (error) + return error; + + pos += sizeof(key); + error = xfile_store(blob->xfile, ptr, size, pos); + if (error) + goto out_err; + + *cookie = blob->last_offset; + blob->last_offset += sizeof(key) + size; + return 0; +out_err: + xfile_discard(blob->xfile, blob->last_offset, sizeof(key)); + return error; +} + +/* Free a blob. */ +int +xfblob_free( + struct xfblob *blob, + xfblob_cookie cookie) +{ + struct xb_key key; + int error; + + error = xfile_load(blob->xfile, &key, sizeof(key), cookie); + if (error) + return error; + + if (key.xb_magic != XB_KEY_MAGIC || key.xb_offset != cookie) { + ASSERT(0); + return -ENODATA; + } + + xfile_discard(blob->xfile, cookie, sizeof(key) + key.xb_size); + return 0; +} + +/* How many bytes is this blob storage object consuming? */ +unsigned long long +xfblob_bytes( + struct xfblob *blob) +{ + return xfile_bytes(blob->xfile); +} + +/* Drop all the blobs. */ +void +xfblob_truncate( + struct xfblob *blob) +{ + xfile_discard(blob->xfile, PAGE_SIZE, MAX_LFS_FILESIZE - PAGE_SIZE); + blob->last_offset = PAGE_SIZE; +} diff --git a/fs/xfs/scrub/xfblob.h b/fs/xfs/scrub/xfblob.h new file mode 100644 index 000000000000..ae78322613ca --- /dev/null +++ b/fs/xfs/scrub/xfblob.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_XFBLOB_H__ +#define __XFS_SCRUB_XFBLOB_H__ + +struct xfblob { + struct xfile *xfile; + loff_t last_offset; +}; + +typedef loff_t xfblob_cookie; + +int xfblob_create(const char *descr, struct xfblob **blobp); +void xfblob_destroy(struct xfblob *blob); +int xfblob_load(struct xfblob *blob, xfblob_cookie cookie, void *ptr, + uint32_t size); +int xfblob_store(struct xfblob *blob, xfblob_cookie *cookie, const void *ptr, + uint32_t size); +int xfblob_free(struct xfblob *blob, xfblob_cookie cookie); +unsigned long long xfblob_bytes(struct xfblob *blob); +void xfblob_truncate(struct xfblob *blob); + +static inline int +xfblob_storename( + struct xfblob *blob, + xfblob_cookie *cookie, + const struct xfs_name *xname) +{ + return xfblob_store(blob, cookie, xname->name, xname->len); +} + +static inline int +xfblob_loadname( + struct xfblob *blob, + xfblob_cookie cookie, + struct xfs_name *xname, + uint32_t size) +{ + int ret = xfblob_load(blob, cookie, (void *)xname->name, size); + if (ret) + return ret; + + xname->len = size; + return 0; +} + +#endif /* __XFS_SCRUB_XFBLOB_H__ */ diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 8cdd863db585..d848222f802b 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -10,9 +10,9 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" -#include "scrub/scrub.h" #include "scrub/trace.h" #include <linux/shmem_fs.h> @@ -310,3 +310,15 @@ xfile_put_folio( folio_unlock(folio); folio_put(folio); } + +/* Discard the page cache that's backing a range of the xfile. 
*/ +void +xfile_discard( + struct xfile *xf, + loff_t pos, + u64 count) +{ + trace_xfile_discard(xf, pos, count); + + shmem_truncate_range(file_inode(xf->file), pos, pos + count - 1); +} diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h index 76d78dba7e34..cc2cc1714cd4 100644 --- a/fs/xfs/scrub/xfile.h +++ b/fs/xfs/scrub/xfile.h @@ -17,6 +17,7 @@ int xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos); int xfile_store(struct xfile *xf, const void *buf, size_t count, loff_t pos); +void xfile_discard(struct xfile *xf, loff_t pos, u64 count); loff_t xfile_seek_data(struct xfile *xf, loff_t pos); #define XFILE_MAX_FOLIO_SIZE (PAGE_SIZE << MAX_PAGECACHE_ORDER) @@ -26,4 +27,9 @@ struct folio *xfile_get_folio(struct xfile *xf, loff_t offset, size_t len, unsigned int flags); void xfile_put_folio(struct xfile *xf, struct folio *folio); +static inline unsigned long long xfile_bytes(struct xfile *xf) +{ + return file_inode(xf->file)->i_blocks << SECTOR_SHIFT; +} + #endif /* __XFS_SCRUB_XFILE_H__ */ diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h index a39befa743ce..f17173b83e6f 100644 --- a/fs/xfs/scrub/xfs_scrub.h +++ b/fs/xfs/scrub/xfs_scrub.h @@ -7,9 +7,11 @@ #define __XFS_SCRUB_H__ #ifndef CONFIG_XFS_ONLINE_SCRUB -# define xfs_scrub_metadata(file, sm) (-ENOTTY) +# define xfs_ioc_scrub_metadata(f, a) (-ENOTTY) +# define xfs_ioc_scrubv_metadata(f, a) (-ENOTTY) #else -int xfs_scrub_metadata(struct file *file, struct xfs_scrub_metadata *sm); +int xfs_ioc_scrub_metadata(struct file *file, void __user *arg); +int xfs_ioc_scrubv_metadata(struct file *file, void __user *arg); #endif /* CONFIG_XFS_ONLINE_SCRUB */ #endif /* __XFS_SCRUB_H__ */ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 4bf69c9c088e..c7c3dcfa2718 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -201,16 +201,17 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (!args.value) return -ENOMEM; xfs_acl_to_disk(args.value, acl); + error = 
xfs_attr_change(&args, XFS_ATTRUPDATE_UPSERT); + kvfree(args.value); + } else { + error = xfs_attr_change(&args, XFS_ATTRUPDATE_REMOVE); + /* + * If the attribute didn't exist to start with that's fine. + */ + if (error == -ENOATTR) + error = 0; } - error = xfs_attr_change(&args); - kvfree(args.value); - - /* - * If the attribute didn't exist to start with that's fine. - */ - if (!acl && error == -ENOATTR) - error = 0; if (!error) set_cached_acl(inode, type, acl); return error; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3f428620ebf2..6dead20338e2 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -233,45 +233,6 @@ xfs_imap_valid( return true; } -/* - * Pass in a dellalloc extent and convert it to real extents, return the real - * extent that maps offset_fsb in wpc->iomap. - * - * The current page is held locked so nothing could have removed the block - * backing offset_fsb, although it could have moved from the COW to the data - * fork by another thread. - */ -static int -xfs_convert_blocks( - struct iomap_writepage_ctx *wpc, - struct xfs_inode *ip, - int whichfork, - loff_t offset) -{ - int error; - unsigned *seq; - - if (whichfork == XFS_COW_FORK) - seq = &XFS_WPC(wpc)->cow_seq; - else - seq = &XFS_WPC(wpc)->data_seq; - - /* - * Attempt to allocate whatever delalloc extent currently backs offset - * and put the result into wpc->iomap. Allocate in a loop because it - * may take several attempts to allocate real blocks for a contiguous - * delalloc extent if free space is sufficiently fragmented. 
- */ - do { - error = xfs_bmapi_convert_delalloc(ip, whichfork, offset, - &wpc->iomap, seq); - if (error) - return error; - } while (wpc->iomap.offset + wpc->iomap.length <= offset); - - return 0; -} - static int xfs_map_blocks( struct iomap_writepage_ctx *wpc, @@ -290,6 +251,7 @@ xfs_map_blocks( struct xfs_iext_cursor icur; int retries = 0; int error = 0; + unsigned int *seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -387,7 +349,19 @@ retry: trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: - error = xfs_convert_blocks(wpc, ip, whichfork, offset); + /* + * Convert a dellalloc extent to a real one. The current page is held + * locked so nothing could have removed the block backing offset_fsb, + * although it could have moved from the COW to the data fork by another + * thread. + */ + if (whichfork == XFS_COW_FORK) + seq = &XFS_WPC(wpc)->cow_seq; + else + seq = &XFS_WPC(wpc)->data_seq; + + error = xfs_bmapi_convert_delalloc(ip, whichfork, offset, + &wpc->iomap, seq); if (error) { /* * If we failed to find the extent in the COW fork we might have @@ -469,7 +443,6 @@ xfs_discard_folio( { struct xfs_inode *ip = XFS_I(folio->mapping->host); struct xfs_mount *mp = ip->i_mount; - int error; if (xfs_is_shutdown(mp)) return; @@ -483,11 +456,8 @@ xfs_discard_folio( * byte of the next folio. Hence the end offset is only dependent on the * folio itself and not the start offset that is passed in. 
*/ - error = xfs_bmap_punch_delalloc_range(ip, pos, + xfs_bmap_punch_delalloc_range(ip, pos, folio_pos(folio) + folio_size(folio)); - - if (error && !xfs_is_shutdown(mp)) - xfs_alert(mp, "page discard unable to remove delalloc mapping."); } static const struct iomap_writeback_ops xfs_writeback_ops = { diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 9b4c61e1c22e..2b10ac4c5fce 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -27,6 +27,7 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_parent.h" struct kmem_cache *xfs_attri_cache; struct kmem_cache *xfs_attrd_cache; @@ -73,8 +74,12 @@ static inline struct xfs_attri_log_nameval * xfs_attri_log_nameval_alloc( const void *name, unsigned int name_len, + const void *new_name, + unsigned int new_name_len, const void *value, - unsigned int value_len) + unsigned int value_len, + const void *new_value, + unsigned int new_value_len) { struct xfs_attri_log_nameval *nv; @@ -83,15 +88,26 @@ xfs_attri_log_nameval_alloc( * this. But kvmalloc() utterly sucks, so we use our own version. 
*/ nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) + - name_len + value_len); + name_len + new_name_len + value_len + + new_value_len); nv->name.i_addr = nv + 1; nv->name.i_len = name_len; nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME; memcpy(nv->name.i_addr, name, name_len); + if (new_name_len) { + nv->new_name.i_addr = nv->name.i_addr + name_len; + nv->new_name.i_len = new_name_len; + memcpy(nv->new_name.i_addr, new_name, new_name_len); + } else { + nv->new_name.i_addr = NULL; + nv->new_name.i_len = 0; + } + nv->new_name.i_type = XLOG_REG_TYPE_ATTR_NEWNAME; + if (value_len) { - nv->value.i_addr = nv->name.i_addr + name_len; + nv->value.i_addr = nv->name.i_addr + name_len + new_name_len; nv->value.i_len = value_len; memcpy(nv->value.i_addr, value, value_len); } else { @@ -100,6 +116,17 @@ xfs_attri_log_nameval_alloc( } nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE; + if (new_value_len) { + nv->new_value.i_addr = nv->name.i_addr + name_len + + new_name_len + value_len; + nv->new_value.i_len = new_value_len; + memcpy(nv->new_value.i_addr, new_value, new_value_len); + } else { + nv->new_value.i_addr = NULL; + nv->new_value.i_len = 0; + } + nv->new_value.i_type = XLOG_REG_TYPE_ATTR_NEWVALUE; + refcount_set(&nv->refcount, 1); return nv; } @@ -145,11 +172,20 @@ xfs_attri_item_size( *nbytes += sizeof(struct xfs_attri_log_format) + xlog_calc_iovec_len(nv->name.i_len); - if (!nv->value.i_len) - return; + if (nv->new_name.i_len) { + *nvecs += 1; + *nbytes += xlog_calc_iovec_len(nv->new_name.i_len); + } - *nvecs += 1; - *nbytes += xlog_calc_iovec_len(nv->value.i_len); + if (nv->value.i_len) { + *nvecs += 1; + *nbytes += xlog_calc_iovec_len(nv->value.i_len); + } + + if (nv->new_value.i_len) { + *nvecs += 1; + *nbytes += xlog_calc_iovec_len(nv->new_value.i_len); + } } /* @@ -179,15 +215,28 @@ xfs_attri_item_format( ASSERT(nv->name.i_len > 0); attrip->attri_format.alfi_size++; + if (nv->new_name.i_len > 0) + attrip->attri_format.alfi_size++; + if (nv->value.i_len > 0) 
attrip->attri_format.alfi_size++; + if (nv->new_value.i_len > 0) + attrip->attri_format.alfi_size++; + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT, &attrip->attri_format, sizeof(struct xfs_attri_log_format)); xlog_copy_from_iovec(lv, &vecp, &nv->name); + + if (nv->new_name.i_len > 0) + xlog_copy_from_iovec(lv, &vecp, &nv->new_name); + if (nv->value.i_len > 0) xlog_copy_from_iovec(lv, &vecp, &nv->value); + + if (nv->new_value.i_len > 0) + xlog_copy_from_iovec(lv, &vecp, &nv->new_value); } /* @@ -308,6 +357,12 @@ xfs_attrd_item_intent( return &ATTRD_ITEM(lip)->attrd_attrip->attri_item; } +static inline unsigned int +xfs_attr_log_item_op(const struct xfs_attri_log_format *attrp) +{ + return attrp->alfi_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK; +} + /* Log an attr to the intent item. */ STATIC void xfs_attr_log_item( @@ -316,6 +371,8 @@ xfs_attr_log_item( const struct xfs_attr_intent *attr) { struct xfs_attri_log_format *attrp; + struct xfs_attri_log_nameval *nv = attr->xattri_nameval; + struct xfs_da_args *args = attr->xattri_da_args; /* * At this point the xfs_attr_intent has been constructed, and we've @@ -323,13 +380,30 @@ xfs_attr_log_item( * structure with fields from this xfs_attr_intent */ attrp = &attrip->attri_format; - attrp->alfi_ino = attr->xattri_da_args->dp->i_ino; + attrp->alfi_ino = args->dp->i_ino; ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK)); attrp->alfi_op_flags = attr->xattri_op_flags; - attrp->alfi_value_len = attr->xattri_nameval->value.i_len; - attrp->alfi_name_len = attr->xattri_nameval->name.i_len; - ASSERT(!(attr->xattri_da_args->attr_filter & ~XFS_ATTRI_FILTER_MASK)); - attrp->alfi_attr_filter = attr->xattri_da_args->attr_filter; + attrp->alfi_value_len = nv->value.i_len; + + switch (xfs_attr_log_item_op(attrp)) { + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: + ASSERT(nv->value.i_len == nv->new_value.i_len); + + attrp->alfi_igen = VFS_I(args->dp)->i_generation; + attrp->alfi_old_name_len = nv->name.i_len; + 
attrp->alfi_new_name_len = nv->new_name.i_len; + break; + case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: + case XFS_ATTRI_OP_FLAGS_PPTR_SET: + attrp->alfi_igen = VFS_I(args->dp)->i_generation; + fallthrough; + default: + attrp->alfi_name_len = nv->name.i_len; + break; + } + + ASSERT(!(args->attr_filter & ~XFS_ATTRI_FILTER_MASK)); + attrp->alfi_attr_filter = args->attr_filter; } /* Get an ATTRI. */ @@ -368,8 +442,11 @@ xfs_attr_create_intent( * Transfer our reference to the name/value buffer to the * deferred work state structure. */ - attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name, - args->namelen, args->value, args->valuelen); + attr->xattri_nameval = xfs_attri_log_nameval_alloc( + args->name, args->namelen, + args->new_name, args->new_namelen, + args->value, args->valuelen, + args->new_value, args->new_valuelen); } attrip = xfs_attri_init(mp, attr->xattri_nameval); @@ -460,17 +537,19 @@ xfs_attri_item_match( return ATTRI_ITEM(lip)->attri_format.alfi_id == intent_id; } +static inline bool +xfs_attri_validate_namelen(unsigned int namelen) +{ + return namelen > 0 && namelen <= XATTR_NAME_MAX; +} + /* Is this recovered ATTRI format ok? 
*/ static inline bool xfs_attri_validate( struct xfs_mount *mp, struct xfs_attri_log_format *attrp) { - unsigned int op = attrp->alfi_op_flags & - XFS_ATTRI_OP_FLAGS_TYPE_MASK; - - if (attrp->__pad != 0) - return false; + unsigned int op = xfs_attr_log_item_op(attrp); if (attrp->alfi_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK) return false; @@ -478,24 +557,75 @@ xfs_attri_validate( if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK) return false; - /* alfi_op_flags should be either a set or remove */ + if (!xfs_attr_check_namespace(attrp->alfi_attr_filter & + XFS_ATTR_NSP_ONDISK_MASK)) + return false; + switch (op) { + case XFS_ATTRI_OP_FLAGS_PPTR_SET: + case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: + if (!xfs_has_parent(mp)) + return false; + if (attrp->alfi_value_len != sizeof(struct xfs_parent_rec)) + return false; + if (!xfs_attri_validate_namelen(attrp->alfi_name_len)) + return false; + if (!(attrp->alfi_attr_filter & XFS_ATTR_PARENT)) + return false; + break; case XFS_ATTRI_OP_FLAGS_SET: case XFS_ATTRI_OP_FLAGS_REPLACE: + if (!xfs_is_using_logged_xattrs(mp)) + return false; + if (attrp->alfi_value_len > XATTR_SIZE_MAX) + return false; + if (!xfs_attri_validate_namelen(attrp->alfi_name_len)) + return false; + break; case XFS_ATTRI_OP_FLAGS_REMOVE: + if (!xfs_is_using_logged_xattrs(mp)) + return false; + if (attrp->alfi_value_len != 0) + return false; + if (!xfs_attri_validate_namelen(attrp->alfi_name_len)) + return false; + break; + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: + if (!xfs_has_parent(mp)) + return false; + if (!xfs_attri_validate_namelen(attrp->alfi_old_name_len)) + return false; + if (!xfs_attri_validate_namelen(attrp->alfi_new_name_len)) + return false; + if (attrp->alfi_value_len != sizeof(struct xfs_parent_rec)) + return false; + if (!(attrp->alfi_attr_filter & XFS_ATTR_PARENT)) + return false; break; default: return false; } - if (attrp->alfi_value_len > XATTR_SIZE_MAX) - return false; + return xfs_verify_ino(mp, attrp->alfi_ino); +} - if 
((attrp->alfi_name_len > XATTR_NAME_MAX) || - (attrp->alfi_name_len == 0)) - return false; +static int +xfs_attri_iread_extents( + struct xfs_inode *ip) +{ + struct xfs_trans *tp; + int error; - return xfs_verify_ino(mp, attrp->alfi_ino); + error = xfs_trans_alloc_empty(ip->i_mount, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_trans_cancel(tp); + + return error; } static inline struct xfs_attr_intent * @@ -508,20 +638,46 @@ xfs_attri_recover_work( { struct xfs_attr_intent *attr; struct xfs_da_args *args; + struct xfs_inode *ip; int local; int error; - error = xlog_recover_iget(mp, attrp->alfi_ino, ipp); - if (error) - return ERR_PTR(error); + /* + * Parent pointer attr items record the generation but regular logged + * xattrs do not; select the right iget function. + */ + switch (xfs_attr_log_item_op(attrp)) { + case XFS_ATTRI_OP_FLAGS_PPTR_SET: + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: + case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: + error = xlog_recover_iget_handle(mp, attrp->alfi_ino, + attrp->alfi_igen, &ip); + break; + default: + error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); + break; + } + if (error) { + xfs_irele(ip); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attrp, + sizeof(*attrp)); + return ERR_PTR(-EFSCORRUPTED); + } + + if (xfs_inode_has_attr_fork(ip)) { + error = xfs_attri_iread_extents(ip); + if (error) { + xfs_irele(ip); + return ERR_PTR(error); + } + } attr = kzalloc(sizeof(struct xfs_attr_intent) + sizeof(struct xfs_da_args), GFP_KERNEL | __GFP_NOFAIL); args = (struct xfs_da_args *)(attr + 1); attr->xattri_da_args = args; - attr->xattri_op_flags = attrp->alfi_op_flags & - XFS_ATTRI_OP_FLAGS_TYPE_MASK; + attr->xattri_op_flags = xfs_attr_log_item_op(attrp); /* * We're reconstructing the deferred work state structure from the @@ -531,35 +687,42 @@ xfs_attri_recover_work( attr->xattri_nameval = xfs_attri_log_nameval_get(nv); 
ASSERT(attr->xattri_nameval); - args->dp = *ipp; + args->dp = ip; args->geo = mp->m_attr_geo; args->whichfork = XFS_ATTR_FORK; args->name = nv->name.i_addr; args->namelen = nv->name.i_len; - args->hashval = xfs_da_hashname(args->name, args->namelen); + args->new_name = nv->new_name.i_addr; + args->new_namelen = nv->new_name.i_len; + args->value = nv->value.i_addr; + args->valuelen = nv->value.i_len; + args->new_value = nv->new_value.i_addr; + args->new_valuelen = nv->new_value.i_len; args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK; args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT | XFS_DA_OP_LOGGED; + args->owner = args->dp->i_ino; + xfs_attr_sethash(args); - ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb)); - - switch (attr->xattri_op_flags) { + switch (xfs_attr_intent_op(attr)) { + case XFS_ATTRI_OP_FLAGS_PPTR_SET: + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: case XFS_ATTRI_OP_FLAGS_SET: case XFS_ATTRI_OP_FLAGS_REPLACE: - args->value = nv->value.i_addr; - args->valuelen = nv->value.i_len; args->total = xfs_attr_calc_size(args, &local); if (xfs_inode_hasattr(args->dp)) attr->xattri_dela_state = xfs_attr_init_replace_state(args); else attr->xattri_dela_state = xfs_attr_init_add_state(args); break; + case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: case XFS_ATTRI_OP_FLAGS_REMOVE: attr->xattri_dela_state = xfs_attr_init_remove_state(args); break; } xfs_defer_add_item(dfp, &attr->xattri_list); + *ipp = ip; return attr; } @@ -591,7 +754,8 @@ xfs_attr_recover_work( */ attrp = &attrip->attri_format; if (!xfs_attri_validate(mp, attrp) || - !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) + !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.i_addr, + nv->name.i_len)) return -EFSCORRUPTED; attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv); @@ -614,16 +778,17 @@ xfs_attr_recover_work( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &attrip->attri_format, sizeof(attrip->attri_format)); - if (error) { - xfs_trans_cancel(tp); - goto out_unlock; - } + 
if (error) + goto out_cancel; error = xfs_defer_ops_capture_and_commit(tp, capture_list); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); return error; +out_cancel: + xfs_trans_cancel(tp); + goto out_unlock; } /* Re-log an intent item to push the log tail forward. */ @@ -649,9 +814,20 @@ xfs_attr_relog_intent( new_attrp = &new_attrip->attri_format; new_attrp->alfi_ino = old_attrp->alfi_ino; + new_attrp->alfi_igen = old_attrp->alfi_igen; new_attrp->alfi_op_flags = old_attrp->alfi_op_flags; new_attrp->alfi_value_len = old_attrp->alfi_value_len; - new_attrp->alfi_name_len = old_attrp->alfi_name_len; + + switch (xfs_attr_log_item_op(old_attrp)) { + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: + new_attrp->alfi_new_name_len = old_attrp->alfi_new_name_len; + new_attrp->alfi_old_name_len = old_attrp->alfi_old_name_len; + break; + default: + new_attrp->alfi_name_len = old_attrp->alfi_name_len; + break; + } + new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter; return &new_attrip->attri_item; @@ -679,6 +855,75 @@ xfs_attr_create_done( return &attrdp->attrd_item; } +void +xfs_attr_defer_add( + struct xfs_da_args *args, + enum xfs_attr_defer_op op) +{ + struct xfs_attr_intent *new; + unsigned int log_op = 0; + bool is_pptr = args->attr_filter & XFS_ATTR_PARENT; + + if (is_pptr) { + ASSERT(xfs_has_parent(args->dp->i_mount)); + ASSERT((args->attr_filter & ~XFS_ATTR_PARENT) == 0); + ASSERT(args->op_flags & XFS_DA_OP_LOGGED); + ASSERT(args->valuelen == sizeof(struct xfs_parent_rec)); + } + + new = kmem_cache_zalloc(xfs_attr_intent_cache, + GFP_NOFS | __GFP_NOFAIL); + new->xattri_da_args = args; + + /* Compute log operation from the higher level op and namespace. 
*/ + switch (op) { + case XFS_ATTR_DEFER_SET: + if (is_pptr) + log_op = XFS_ATTRI_OP_FLAGS_PPTR_SET; + else + log_op = XFS_ATTRI_OP_FLAGS_SET; + break; + case XFS_ATTR_DEFER_REPLACE: + if (is_pptr) + log_op = XFS_ATTRI_OP_FLAGS_PPTR_REPLACE; + else + log_op = XFS_ATTRI_OP_FLAGS_REPLACE; + break; + case XFS_ATTR_DEFER_REMOVE: + if (is_pptr) + log_op = XFS_ATTRI_OP_FLAGS_PPTR_REMOVE; + else + log_op = XFS_ATTRI_OP_FLAGS_REMOVE; + break; + default: + ASSERT(0); + break; + } + new->xattri_op_flags = log_op; + + /* Set up initial attr operation state. */ + switch (log_op) { + case XFS_ATTRI_OP_FLAGS_PPTR_SET: + case XFS_ATTRI_OP_FLAGS_SET: + new->xattri_dela_state = xfs_attr_init_add_state(args); + break; + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: + ASSERT(args->new_valuelen == args->valuelen); + new->xattri_dela_state = xfs_attr_init_replace_state(args); + break; + case XFS_ATTRI_OP_FLAGS_REPLACE: + new->xattri_dela_state = xfs_attr_init_replace_state(args); + break; + case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: + case XFS_ATTRI_OP_FLAGS_REMOVE: + new->xattri_dela_state = xfs_attr_init_remove_state(args); + break; + } + + xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type); + trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); +} + const struct xfs_defer_op_type xfs_attr_defer_type = { .name = "attr", .max_items = 1, @@ -691,6 +936,56 @@ const struct xfs_defer_op_type xfs_attr_defer_type = { .relog_intent = xfs_attr_relog_intent, }; +static inline void * +xfs_attri_validate_name_iovec( + struct xfs_mount *mp, + struct xfs_attri_log_format *attri_formatp, + const struct xfs_log_iovec *iovec, + unsigned int name_len) +{ + if (iovec->i_len != xlog_calc_iovec_len(name_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, sizeof(*attri_formatp)); + return NULL; + } + + if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, iovec->i_addr, + name_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, 
sizeof(*attri_formatp)); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + iovec->i_addr, iovec->i_len); + return NULL; + } + + return iovec->i_addr; +} + +static inline void * +xfs_attri_validate_value_iovec( + struct xfs_mount *mp, + struct xfs_attri_log_format *attri_formatp, + const struct xfs_log_iovec *iovec, + unsigned int value_len) +{ + if (iovec->i_len != xlog_calc_iovec_len(value_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, sizeof(*attri_formatp)); + return NULL; + } + + if ((attri_formatp->alfi_attr_filter & XFS_ATTR_PARENT) && + !xfs_parent_valuecheck(mp, iovec->i_addr, value_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, sizeof(*attri_formatp)); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + iovec->i_addr, iovec->i_len); + return NULL; + } + + return iovec->i_addr; +} + STATIC int xlog_recover_attri_commit_pass2( struct xlog *log, @@ -702,51 +997,177 @@ xlog_recover_attri_commit_pass2( struct xfs_attri_log_item *attrip; struct xfs_attri_log_format *attri_formatp; struct xfs_attri_log_nameval *nv; - const void *attr_value = NULL; const void *attr_name; + const void *attr_value = NULL; + const void *attr_new_name = NULL; + const void *attr_new_value = NULL; size_t len; - - attri_formatp = item->ri_buf[0].i_addr; - attr_name = item->ri_buf[1].i_addr; + unsigned int name_len = 0; + unsigned int value_len = 0; + unsigned int new_name_len = 0; + unsigned int new_value_len = 0; + unsigned int op, i = 0; /* Validate xfs_attri_log_format before the large memory allocation */ len = sizeof(struct xfs_attri_log_format); - if (item->ri_buf[0].i_len != len) { + if (item->ri_buf[i].i_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } + attri_formatp = item->ri_buf[i].i_addr; if (!xfs_attri_validate(mp, attri_formatp)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - 
item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + attri_formatp, len); return -EFSCORRUPTED; } - /* Validate the attr name */ - if (item->ri_buf[1].i_len != - xlog_calc_iovec_len(attri_formatp->alfi_name_len)) { + /* Check the number of log iovecs makes sense for the op code. */ + op = xfs_attr_log_item_op(attri_formatp); + switch (op) { + case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: + case XFS_ATTRI_OP_FLAGS_PPTR_SET: + /* Log item, attr name, attr value */ + if (item->ri_total != 3) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + name_len = attri_formatp->alfi_name_len; + value_len = attri_formatp->alfi_value_len; + break; + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + /* Log item, attr name, attr value */ + if (item->ri_total != 3) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + name_len = attri_formatp->alfi_name_len; + value_len = attri_formatp->alfi_value_len; + break; + case XFS_ATTRI_OP_FLAGS_REMOVE: + /* Log item, attr name */ + if (item->ri_total != 2) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + name_len = attri_formatp->alfi_name_len; + break; + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: + /* + * Log item, attr name, new attr name, attr value, new attr + * value + */ + if (item->ri_total != 5) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + name_len = attri_formatp->alfi_old_name_len; + new_name_len = attri_formatp->alfi_new_name_len; + new_value_len = value_len = attri_formatp->alfi_value_len; + break; + default: XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + attri_formatp, len); return -EFSCORRUPTED; } + i++; - if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - 
item->ri_buf[1].i_addr, item->ri_buf[1].i_len); + /* Validate the attr name */ + attr_name = xfs_attri_validate_name_iovec(mp, attri_formatp, + &item->ri_buf[i], name_len); + if (!attr_name) return -EFSCORRUPTED; + i++; + + /* Validate the new attr name */ + if (new_name_len > 0) { + attr_new_name = xfs_attri_validate_name_iovec(mp, + attri_formatp, &item->ri_buf[i], + new_name_len); + if (!attr_new_name) + return -EFSCORRUPTED; + i++; } /* Validate the attr value, if present */ - if (attri_formatp->alfi_value_len != 0) { - if (item->ri_buf[2].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) { + if (value_len != 0) { + attr_value = xfs_attri_validate_value_iovec(mp, attri_formatp, + &item->ri_buf[i], value_len); + if (!attr_value) + return -EFSCORRUPTED; + i++; + } + + /* Validate the new attr value, if present */ + if (new_value_len != 0) { + attr_new_value = xfs_attri_validate_value_iovec(mp, + attri_formatp, &item->ri_buf[i], + new_value_len); + if (!attr_new_value) + return -EFSCORRUPTED; + i++; + } + + /* + * Make sure we got the correct number of buffers for the operation + * that we just loaded. + */ + if (i != item->ri_total) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + + switch (op) { + case XFS_ATTRI_OP_FLAGS_REMOVE: + /* Regular remove operations operate only on names. */ + if (attr_value != NULL || value_len != 0) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, - item->ri_buf[0].i_len); + attri_formatp, len); return -EFSCORRUPTED; } - - attr_value = item->ri_buf[2].i_addr; + fallthrough; + case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: + case XFS_ATTRI_OP_FLAGS_PPTR_SET: + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + /* + * Regular xattr set/remove/replace operations require a name + * and do not take a newname. Values are optional for set and + * replace. 
+ * + * Name-value set/remove operations must have a name, do not + * take a newname, and can take a value. + */ + if (attr_name == NULL || name_len == 0) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + break; + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: + /* + * Name-value replace operations require the caller to + * specify the old and new names and values explicitly. + * Values are optional. + */ + if (attr_name == NULL || name_len == 0) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + if (attr_new_name == NULL || new_name_len == 0) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + break; } /* @@ -754,9 +1175,10 @@ xlog_recover_attri_commit_pass2( * name/value buffer to the recovered incore log item and drop our * reference. */ - nv = xfs_attri_log_nameval_alloc(attr_name, - attri_formatp->alfi_name_len, attr_value, - attri_formatp->alfi_value_len); + nv = xfs_attri_log_nameval_alloc(attr_name, name_len, + attr_new_name, new_name_len, + attr_value, value_len, + attr_new_value, new_value_len); attrip = xfs_attri_init(mp, nv); memcpy(&attrip->attri_format, attri_formatp, len); diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h index 3280a7930287..e74128cbb722 100644 --- a/fs/xfs/xfs_attr_item.h +++ b/fs/xfs/xfs_attr_item.h @@ -13,7 +13,9 @@ struct kmem_zone; struct xfs_attri_log_nameval { struct xfs_log_iovec name; + struct xfs_log_iovec new_name; /* PPTR_REPLACE only */ struct xfs_log_iovec value; + struct xfs_log_iovec new_value; /* PPTR_REPLACE only */ refcount_t refcount; /* name and value follow the end of this struct */ @@ -51,4 +53,12 @@ struct xfs_attrd_log_item { extern struct kmem_cache *xfs_attri_cache; extern struct kmem_cache *xfs_attrd_cache; +enum xfs_attr_defer_op { + XFS_ATTR_DEFER_SET, + XFS_ATTR_DEFER_REMOVE, + XFS_ATTR_DEFER_REPLACE, +}; + +void 
xfs_attr_defer_add(struct xfs_da_args *args, enum xfs_attr_defer_op op); + #endif /* __XFS_ATTR_ITEM_H__ */ diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index a6819a642cc0..5c947e5ce8b8 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -82,7 +82,8 @@ xfs_attr_shortform_list( (dp->i_af.if_bytes + sf->count * 16) < context->bufsize)) { for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) { if (XFS_IS_CORRUPT(context->dp->i_mount, - !xfs_attr_namecheck(sfe->nameval, + !xfs_attr_namecheck(sfe->flags, + sfe->nameval, sfe->namelen))) { xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK); return -EFSCORRUPTED; @@ -91,6 +92,7 @@ xfs_attr_shortform_list( sfe->flags, sfe->nameval, (int)sfe->namelen, + &sfe->nameval[sfe->namelen], (int)sfe->valuelen); /* * Either search callback finished early or @@ -122,7 +124,8 @@ xfs_attr_shortform_list( for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) { if (unlikely( ((char *)sfe < (char *)sf) || - ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) { + ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)) || + !xfs_attr_check_namespace(sfe->flags))) { XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", XFS_ERRLEVEL_LOW, context->dp->i_mount, sfe, @@ -133,12 +136,16 @@ xfs_attr_shortform_list( } sbp->entno = i; - sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen); sbp->name = sfe->nameval; sbp->namelen = sfe->namelen; /* These are bytes, and both on-disk, don't endian-flip */ + sbp->value = &sfe->nameval[sfe->namelen]; sbp->valuelen = sfe->valuelen; sbp->flags = sfe->flags; + sbp->hash = xfs_attr_hashval(dp->i_mount, sfe->flags, + sfe->nameval, sfe->namelen, + sfe->nameval + sfe->namelen, + sfe->valuelen); sfe = xfs_attr_sf_nextentry(sfe); sbp++; nsbuf++; @@ -177,7 +184,7 @@ xfs_attr_shortform_list( cursor->offset = 0; } if (XFS_IS_CORRUPT(context->dp->i_mount, - !xfs_attr_namecheck(sbp->flags, sbp->name, sbp->namelen))) { 
xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK); error = -EFSCORRUPTED; @@ -187,6 +194,7 @@ xfs_attr_shortform_list( sbp->flags, sbp->name, sbp->namelen, + sbp->value, sbp->valuelen); if (context->seen_enough) break; @@ -214,6 +222,7 @@ xfs_attr_node_list_lookup( struct xfs_mount *mp = dp->i_mount; struct xfs_trans *tp = context->tp; struct xfs_buf *bp; + xfs_failaddr_t fa; int i; int error = 0; unsigned int expected_level = 0; @@ -238,6 +247,10 @@ xfs_attr_node_list_lookup( goto out_corruptbuf; } + fa = xfs_da3_node_header_check(bp, dp->i_ino); + if (fa) + goto out_corruptbuf; + xfs_da3_node_hdr_from_disk(mp, &nodehdr, node); /* Tree taller than we can handle; bail out! */ @@ -273,6 +286,12 @@ xfs_attr_node_list_lookup( } } + fa = xfs_attr3_leaf_header_check(bp, dp->i_ino); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + goto out_releasebuf; + } + if (expected_level != 0) goto out_corruptbuf; @@ -281,6 +300,7 @@ xfs_attr_node_list_lookup( out_corruptbuf: xfs_buf_mark_corrupt(bp); +out_releasebuf: xfs_trans_brelse(tp, bp); xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); return -EFSCORRUPTED; @@ -297,6 +317,7 @@ xfs_attr_node_list( struct xfs_buf *bp; struct xfs_inode *dp = context->dp; struct xfs_mount *mp = dp->i_mount; + xfs_failaddr_t fa; int error = 0; trace_xfs_attr_node_list(context); @@ -310,46 +331,60 @@ xfs_attr_node_list( */ bp = NULL; if (cursor->blkno > 0) { + struct xfs_attr_leaf_entry *entries; + error = xfs_da3_node_read(context->tp, dp, cursor->blkno, &bp, XFS_ATTR_FORK); if (xfs_metadata_is_sick(error)) xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); - if ((error != 0) && (error != -EFSCORRUPTED)) + if (error != 0 && error != -EFSCORRUPTED) return error; - if (bp) { - struct xfs_attr_leaf_entry *entries; + if (!bp) + goto need_lookup; - node = bp->b_addr; - switch (be16_to_cpu(node->hdr.info.magic)) { - case XFS_DA_NODE_MAGIC: - case XFS_DA3_NODE_MAGIC: - trace_xfs_attr_list_wrong_blk(context); + node = bp->b_addr; + switch 
(be16_to_cpu(node->hdr.info.magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + trace_xfs_attr_list_wrong_blk(context); + fa = xfs_da3_node_header_check(bp, dp->i_ino); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); + } + xfs_trans_brelse(context->tp, bp); + bp = NULL; + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + leaf = bp->b_addr; + fa = xfs_attr3_leaf_header_check(bp, dp->i_ino); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); xfs_trans_brelse(context->tp, bp); + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); bp = NULL; break; - case XFS_ATTR_LEAF_MAGIC: - case XFS_ATTR3_LEAF_MAGIC: - leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, - &leafhdr, leaf); - entries = xfs_attr3_leaf_entryp(leaf); - if (cursor->hashval > be32_to_cpu( - entries[leafhdr.count - 1].hashval)) { - trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(context->tp, bp); - bp = NULL; - } else if (cursor->hashval <= be32_to_cpu( - entries[0].hashval)) { - trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(context->tp, bp); - bp = NULL; - } - break; - default: + } + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, + &leafhdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + if (cursor->hashval > be32_to_cpu( + entries[leafhdr.count - 1].hashval)) { + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(context->tp, bp); + bp = NULL; + } else if (cursor->hashval <= be32_to_cpu( + entries[0].hashval)) { trace_xfs_attr_list_wrong_blk(context); xfs_trans_brelse(context->tp, bp); bp = NULL; } + break; + default: + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(context->tp, bp); + bp = NULL; } } @@ -359,6 +394,7 @@ xfs_attr_node_list( * Note that start of node block is same as start of leaf block. 
*/ if (bp == NULL) { +need_lookup: error = xfs_attr_node_list_lookup(context, cursor, &bp); if (error || !bp) return error; @@ -380,8 +416,8 @@ xfs_attr_node_list( break; cursor->blkno = leafhdr.forw; xfs_trans_brelse(context->tp, bp); - error = xfs_attr3_leaf_read(context->tp, dp, cursor->blkno, - &bp); + error = xfs_attr3_leaf_read(context->tp, dp, dp->i_ino, + cursor->blkno, &bp); if (error) return error; } @@ -446,6 +482,7 @@ xfs_attr3_leaf_list_int( */ for (; i < ichdr.count; entry++, i++) { char *name; + void *value; int namelen, valuelen; if (be32_to_cpu(entry->hashval) != cursor->hashval) { @@ -463,6 +500,7 @@ xfs_attr3_leaf_list_int( name_loc = xfs_attr3_leaf_name_local(leaf, i); name = name_loc->nameval; namelen = name_loc->namelen; + value = &name_loc->nameval[name_loc->namelen]; valuelen = be16_to_cpu(name_loc->valuelen); } else { xfs_attr_leaf_name_remote_t *name_rmt; @@ -470,16 +508,18 @@ xfs_attr3_leaf_list_int( name_rmt = xfs_attr3_leaf_name_remote(leaf, i); name = name_rmt->name; namelen = name_rmt->namelen; + value = NULL; valuelen = be32_to_cpu(name_rmt->valuelen); } if (XFS_IS_CORRUPT(context->dp->i_mount, - !xfs_attr_namecheck(name, namelen))) { + !xfs_attr_namecheck(entry->flags, name, + namelen))) { xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK); return -EFSCORRUPTED; } context->put_listent(context, entry->flags, - name, namelen, valuelen); + name, namelen, value, valuelen); if (context->seen_enough) break; cursor->offset++; @@ -501,7 +541,8 @@ xfs_attr_leaf_list( trace_xfs_attr_leaf_list(context); context->cursor.blkno = 0; - error = xfs_attr3_leaf_read(context->tp, context->dp, 0, &bp); + error = xfs_attr3_leaf_read(context->tp, context->dp, + context->dp->i_ino, 0, &bp); if (error) return error; @@ -515,6 +556,7 @@ xfs_attr_list_ilocked( struct xfs_attr_list_context *context) { struct xfs_inode *dp = context->dp; + int error; xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); @@ -525,6 +567,12 @@ xfs_attr_list_ilocked( return 0; 
if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_list(context); + + /* Prerequisite for xfs_attr_is_leaf */ + error = xfs_iread_extents(NULL, dp, XFS_ATTR_FORK); + if (error) + return error; + if (xfs_attr_is_leaf(dp)) return xfs_attr_leaf_list(context); return xfs_attr_node_list(context); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index d27859a684aa..a19d62e78aa1 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -524,9 +524,7 @@ xfs_bmap_recover_work( else iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; - error = xfs_iext_count_may_overflow(ip, work->bi_whichfork, iext_delta); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, iext_delta); + error = xfs_iext_count_extend(tp, ip, work->bi_whichfork, iext_delta); if (error) goto err_cancel; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 19e11d1da660..ac2e77ebb54c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -440,7 +440,7 @@ out_unlock_iolock: * if the ranges only partially overlap them, so it is up to the caller to * ensure that partial blocks are not passed in. */ -int +void xfs_bmap_punch_delalloc_range( struct xfs_inode *ip, xfs_off_t start_byte, @@ -452,7 +452,6 @@ xfs_bmap_punch_delalloc_range( xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; - int error = 0; ASSERT(!xfs_need_iread_extents(ifp)); @@ -476,15 +475,13 @@ xfs_bmap_punch_delalloc_range( continue; } - error = xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, - &got, &del); - if (error || !xfs_iext_get_extent(ifp, &icur, &got)) + xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, &got, &del); + if (!xfs_iext_get_extent(ifp, &icur, &got)) break; } out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; } /* @@ -542,7 +539,7 @@ xfs_can_free_eofblocks( * forever. 
*/ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); - if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) + if (xfs_inode_has_bigrtalloc(ip)) end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) @@ -713,41 +710,37 @@ xfs_alloc_file_space( if (error) break; - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, - XFS_IEXT_ADD_NOSPLIT_CNT); - if (error) - goto error; - - error = xfs_bmapi_write(tp, ip, startoffset_fsb, - allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp, - &nimaps); if (error) goto error; - ip->i_diflags |= XFS_DIFLAG_PREALLOC; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - break; - /* * If the allocator cannot find a single free extent large * enough to cover the start block of the requested range, - * xfs_bmapi_write will return 0 but leave *nimaps set to 0. + * xfs_bmapi_write will return -ENOSR. * * In that case we simply need to keep looping with the same * startoffset_fsb so that one of the following allocations * will eventually reach the requested range. 
*/ - if (nimaps) { + error = xfs_bmapi_write(tp, ip, startoffset_fsb, + allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp, + &nimaps); + if (error) { + if (error != -ENOSR) + goto error; + error = 0; + } else { startoffset_fsb += imapp->br_blockcount; allocatesize_fsb -= imapp->br_blockcount; } + + ip->i_diflags |= XFS_DIFLAG_PREALLOC; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); } return error; @@ -775,10 +768,8 @@ xfs_unmap_extent( if (error) return error; - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_PUNCH_HOLE_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT); if (error) goto out_trans_cancel; @@ -843,7 +834,7 @@ xfs_free_file_space( endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); /* We can only free complete realtime extents. */ - if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { + if (xfs_inode_has_bigrtalloc(ip)) { startoffset_fsb = xfs_rtb_roundup_rtx(mp, startoffset_fsb); endoffset_fsb = xfs_rtb_rounddown_rtx(mp, endoffset_fsb); } @@ -1054,10 +1045,8 @@ xfs_insert_file_space( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_PUNCH_HOLE_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT); if (error) goto out_trans_cancel; @@ -1283,23 +1272,17 @@ xfs_swap_extent_rmap( trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); if (xfs_bmap_is_real_extent(&uirec)) { - error = xfs_iext_count_may_overflow(ip, + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_SWAP_RMAP_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, - XFS_IEXT_SWAP_RMAP_CNT); if (error) goto out; } if (xfs_bmap_is_real_extent(&irec)) { - error = xfs_iext_count_may_overflow(tip, + error = 
xfs_iext_count_extend(tp, tip, XFS_DATA_FORK, XFS_IEXT_SWAP_RMAP_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, - XFS_IEXT_SWAP_RMAP_CNT); if (error) goto out; } diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 77ecbb753ef2..51f84d8ff372 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -30,7 +30,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) } #endif /* CONFIG_XFS_RT */ -int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, +void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_off_t start_byte, xfs_off_t end_byte); struct kgetbmap { diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f0fa02264eda..aa4dbda7b536 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -494,6 +494,9 @@ _xfs_buf_obj_cmp( * it stale has not yet committed. i.e. we are * reallocating a busy extent. Skip this buffer and * continue searching for an exact match. + * + * Note: If we're scanning for incore buffers to stale, don't + * complain if we find non-stale buffers. 
*/ if (!(map->bm_flags & XBM_LIVESCAN)) ASSERT(bp->b_flags & XBF_STALE); @@ -2043,7 +2046,7 @@ xfs_setsize_buftarg( btp->bt_meta_sectorsize = sectorsize; btp->bt_meta_sectormask = sectorsize - 1; - if (set_blocksize(btp->bt_bdev, sectorsize)) { + if (set_blocksize(btp->bt_bdev_file, sectorsize)) { xfs_warn(btp->bt_mount, "Cannot set_blocksize to %u on device %pg", sectorsize, btp->bt_bdev); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index cf9296b7e06f..06ac5a7de60a 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -157,7 +157,7 @@ xfs_dir2_block_getdents( if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; - error = xfs_dir3_block_read(args->trans, dp, &bp); + error = xfs_dir3_block_read(args->trans, dp, args->owner, &bp); if (error) return error; @@ -282,7 +282,8 @@ xfs_dir2_leaf_readbuf( new_off = xfs_dir2_da_to_byte(geo, map.br_startoff); if (new_off > *cur_off) *cur_off = new_off; - error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, 0, &bp); + error = xfs_dir3_data_read(args->trans, dp, args->owner, + map.br_startoff, 0, &bp); if (error) goto out; @@ -515,7 +516,6 @@ xfs_readdir( { struct xfs_da_args args = { NULL }; unsigned int lock_mode; - bool isblock; int error; trace_xfs_readdir(dp); @@ -532,23 +532,24 @@ xfs_readdir( args.dp = dp; args.geo = dp->i_mount->m_dir_geo; args.trans = tp; + args.owner = dp->i_ino; if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) return xfs_dir2_sf_getdents(&args, ctx); lock_mode = xfs_ilock_data_map_shared(dp); - error = xfs_dir2_isblock(&args, &isblock); - if (error) - goto out_unlock; - - if (isblock) { + switch (xfs_dir2_format(&args, &error)) { + case XFS_DIR2_FMT_BLOCK: error = xfs_dir2_block_getdents(&args, ctx, &lock_mode); - goto out_unlock; + break; + case XFS_DIR2_FMT_LEAF: + case XFS_DIR2_FMT_NODE: + error = xfs_dir2_leaf_getdents(&args, ctx, bufsize, &lock_mode); + break; + default: + break; } - error = xfs_dir2_leaf_getdents(&args, ctx, 
bufsize, &lock_mode); - -out_unlock: if (lock_mode) xfs_iunlock(dp, lock_mode); return error; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 268bb734dc0a..25fe3b932b5a 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -145,14 +145,18 @@ xfs_discard_extents( return error; } +struct xfs_trim_cur { + xfs_agblock_t start; + xfs_extlen_t count; + xfs_agblock_t end; + xfs_extlen_t minlen; + bool by_bno; +}; static int xfs_trim_gather_extents( struct xfs_perag *pag, - xfs_daddr_t start, - xfs_daddr_t end, - xfs_daddr_t minlen, - struct xfs_alloc_rec_incore *tcur, + struct xfs_trim_cur *tcur, struct xfs_busy_extents *extents, uint64_t *blocks_trimmed) { @@ -179,21 +183,26 @@ xfs_trim_gather_extents( if (error) goto out_trans_cancel; - cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag); - - /* - * Look up the extent length requested in the AGF and start with it. - */ - if (tcur->ar_startblock == NULLAGBLOCK) - error = xfs_alloc_lookup_ge(cur, 0, tcur->ar_blockcount, &i); - else - error = xfs_alloc_lookup_le(cur, tcur->ar_startblock, - tcur->ar_blockcount, &i); + if (tcur->by_bno) { + /* sub-AG discard request always starts at tcur->start */ + cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag); + error = xfs_alloc_lookup_le(cur, tcur->start, 0, &i); + if (!error && !i) + error = xfs_alloc_lookup_ge(cur, tcur->start, 0, &i); + } else if (tcur->start == 0) { + /* first time through a by-len starts with max length */ + cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag); + error = xfs_alloc_lookup_ge(cur, 0, tcur->count, &i); + } else { + /* nth time through a by-len starts where we left off */ + cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag); + error = xfs_alloc_lookup_le(cur, tcur->start, tcur->count, &i); + } if (error) goto out_del_cursor; if (i == 0) { /* nothing of that length left in the AG, we are done */ - tcur->ar_blockcount = 0; + tcur->count = 0; goto out_del_cursor; } @@ -204,8 +213,6 @@ xfs_trim_gather_extents( while (i) { xfs_agblock_t fbno; 
xfs_extlen_t flen; - xfs_daddr_t dbno; - xfs_extlen_t dlen; error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); if (error) @@ -221,38 +228,46 @@ xfs_trim_gather_extents( * Update the cursor to point at this extent so we * restart the next batch from this extent. */ - tcur->ar_startblock = fbno; - tcur->ar_blockcount = flen; - break; - } - - /* - * use daddr format for all range/len calculations as that is - * the format the range/len variables are supplied in by - * userspace. - */ - dbno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, fbno); - dlen = XFS_FSB_TO_BB(mp, flen); - - /* - * Too small? Give up. - */ - if (dlen < minlen) { - trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen); - tcur->ar_blockcount = 0; + tcur->start = fbno; + tcur->count = flen; break; } /* * If the extent is entirely outside of the range we are - * supposed to discard skip it. Do not bother to trim - * down partially overlapping ranges for now. + * supposed to discard, skip it. Extents that only partially overlap + * the range are trimmed to it below. */ - if (dbno + dlen < start || dbno > end) { + if (fbno + flen < tcur->start) { + trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen); + goto next_extent; + } + if (fbno > tcur->end) { trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen); + if (tcur->by_bno) { + tcur->count = 0; + break; + } goto next_extent; } + /* Trim the extent returned to the range we want. */ + if (fbno < tcur->start) { + flen -= tcur->start - fbno; + fbno = tcur->start; + } + if (fbno + flen > tcur->end + 1) + flen = tcur->end - fbno + 1; + + /* Too small? Give up. */ + if (flen < tcur->minlen) { + trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen); + if (tcur->by_bno) + goto next_extent; + tcur->count = 0; + break; + } + /* * If any blocks in the range are still busy, skip the * discard and try again the next time. 
@@ -266,7 +281,10 @@ xfs_trim_gather_extents( &extents->extent_list); *blocks_trimmed += flen; next_extent: - error = xfs_btree_decrement(cur, 0, &i); + if (tcur->by_bno) + error = xfs_btree_increment(cur, 0, &i); + else + error = xfs_btree_decrement(cur, 0, &i); if (error) break; @@ -276,7 +294,7 @@ next_extent: * is no more extents to search. */ if (i == 0) - tcur->ar_blockcount = 0; + tcur->count = 0; } /* @@ -306,17 +324,22 @@ xfs_trim_should_stop(void) static int xfs_trim_extents( struct xfs_perag *pag, - xfs_daddr_t start, - xfs_daddr_t end, - xfs_daddr_t minlen, + xfs_agblock_t start, + xfs_agblock_t end, + xfs_extlen_t minlen, uint64_t *blocks_trimmed) { - struct xfs_alloc_rec_incore tcur = { - .ar_blockcount = pag->pagf_longest, - .ar_startblock = NULLAGBLOCK, + struct xfs_trim_cur tcur = { + .start = start, + .count = pag->pagf_longest, + .end = end, + .minlen = minlen, }; int error = 0; + if (start != 0 || end != pag->block_count) + tcur.by_bno = true; + do { struct xfs_busy_extents *extents; @@ -330,8 +353,8 @@ xfs_trim_extents( extents->owner = extents; INIT_LIST_HEAD(&extents->extent_list); - error = xfs_trim_gather_extents(pag, start, end, minlen, - &tcur, extents, blocks_trimmed); + error = xfs_trim_gather_extents(pag, &tcur, extents, + blocks_trimmed); if (error) { kfree(extents); break; @@ -354,7 +377,7 @@ xfs_trim_extents( if (xfs_trim_should_stop()) break; - } while (tcur.ar_blockcount != 0); + } while (tcur.count != 0); return error; @@ -378,8 +401,10 @@ xfs_ioc_trim( unsigned int granularity = bdev_discard_granularity(mp->m_ddev_targp->bt_bdev); struct fstrim_range range; - xfs_daddr_t start, end, minlen; - xfs_agnumber_t agno; + xfs_daddr_t start, end; + xfs_extlen_t minlen; + xfs_agnumber_t start_agno, end_agno; + xfs_agblock_t start_agbno, end_agbno; uint64_t blocks_trimmed = 0; int error, last_error = 0; @@ -399,7 +424,8 @@ xfs_ioc_trim( return -EFAULT; range.minlen = max_t(u64, granularity, range.minlen); - minlen = BTOBB(range.minlen); + 
minlen = XFS_B_TO_FSB(mp, range.minlen); + /* * Truncating down the len isn't actually quite correct, but using * BBTOB would mean we trivially get overflows for values @@ -413,15 +439,21 @@ xfs_ioc_trim( return -EINVAL; start = BTOBB(range.start); - end = start + BTOBBT(range.len) - 1; + end = min_t(xfs_daddr_t, start + BTOBBT(range.len), + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) - 1; + + start_agno = xfs_daddr_to_agno(mp, start); + start_agbno = xfs_daddr_to_agbno(mp, start); + end_agno = xfs_daddr_to_agno(mp, end); + end_agbno = xfs_daddr_to_agbno(mp, end); - if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1) - end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1; + for_each_perag_range(mp, start_agno, end_agno, pag) { + xfs_agblock_t agend = pag->block_count; - agno = xfs_daddr_to_agno(mp, start); - for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) { - error = xfs_trim_extents(pag, start, end, minlen, - &blocks_trimmed); + if (start_agno == end_agno) + agend = end_agbno; + error = xfs_trim_extents(pag, start_agbno, agend, minlen, + &blocks_trimmed); if (error) last_error = error; @@ -429,6 +461,7 @@ xfs_ioc_trim( xfs_perag_rele(pag); break; } + start_agbno = 0; } if (last_error) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index c98cb468c357..c1b211c260a9 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -341,11 +341,8 @@ xfs_dquot_disk_alloc( goto err_cancel; } - error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK, + error = xfs_iext_count_extend(tp, quotip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, quotip, - XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto err_cancel; @@ -357,7 +354,6 @@ xfs_dquot_disk_alloc( goto err_cancel; ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); - ASSERT(nmaps == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); @@ -1371,6 +1367,47 @@ xfs_dqlock2( } } +static int +xfs_dqtrx_cmp( + const void 
*a, + const void *b) +{ + const struct xfs_dqtrx *qa = a; + const struct xfs_dqtrx *qb = b; + + if (qa->qt_dquot->q_id > qb->qt_dquot->q_id) + return 1; + if (qa->qt_dquot->q_id < qb->qt_dquot->q_id) + return -1; + return 0; +} + +void +xfs_dqlockn( + struct xfs_dqtrx *q) +{ + unsigned int i; + + BUILD_BUG_ON(XFS_QM_TRANS_MAXDQS > MAX_LOCKDEP_SUBCLASSES); + + /* Sort in order of dquot id, do not allow duplicates */ + for (i = 0; i < XFS_QM_TRANS_MAXDQS && q[i].qt_dquot != NULL; i++) { + unsigned int j; + + for (j = 0; j < i; j++) + ASSERT(q[i].qt_dquot != q[j].qt_dquot); + } + if (i == 0) + return; + + sort(q, i, sizeof(struct xfs_dqtrx), xfs_dqtrx_cmp, NULL); + + mutex_lock(&q[0].qt_dquot->q_qlock); + for (i = 1; i < XFS_QM_TRANS_MAXDQS && q[i].qt_dquot != NULL; i++) + mutex_lock_nested(&q[i].qt_dquot->q_qlock, + XFS_QLOCK_NESTED + i - 1); +} + int __init xfs_qm_init(void) { diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 956272d9b302..677bb2dc9ac9 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -223,6 +223,7 @@ int xfs_qm_dqget_uncached(struct xfs_mount *mp, void xfs_qm_dqput(struct xfs_dquot *dqp); void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); +void xfs_dqlockn(struct xfs_dqtrx *q); void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 7ad0e92c6b5b..78cdc5064a8c 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -62,6 +62,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_ATTR_LEAF_TO_NODE, XFS_RANDOM_WB_DELAY_MS, XFS_RANDOM_WRITE_DELAY_MS, + XFS_RANDOM_EXCHMAPS_FINISH_ONE, }; struct xfs_errortag_attr { @@ -179,6 +180,7 @@ XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS); +XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, 
XFS_ERRTAG_EXCHMAPS_FINISH_ONE); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -224,6 +226,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), XFS_ERRORTAG_ATTR_LIST(write_delay_ms), + XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one), NULL, }; ATTRIBUTE_GROUPS(xfs_errortag); diff --git a/fs/xfs/xfs_exchmaps_item.c b/fs/xfs/xfs_exchmaps_item.c new file mode 100644 index 000000000000..264a121c5e16 --- /dev/null +++ b/fs/xfs/xfs_exchmaps_item.c @@ -0,0 +1,614 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_shared.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_exchmaps_item.h" +#include "xfs_exchmaps.h" +#include "xfs_log.h" +#include "xfs_bmap.h" +#include "xfs_icache.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" +#include "xfs_exchrange.h" +#include "xfs_trace.h" + +struct kmem_cache *xfs_xmi_cache; +struct kmem_cache *xfs_xmd_cache; + +static const struct xfs_item_ops xfs_xmi_item_ops; + +static inline struct xfs_xmi_log_item *XMI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_xmi_log_item, xmi_item); +} + +STATIC void +xfs_xmi_item_free( + struct xfs_xmi_log_item *xmi_lip) +{ + kvfree(xmi_lip->xmi_item.li_lv_shadow); + kmem_cache_free(xfs_xmi_cache, xmi_lip); +} + +/* + * Freeing the XMI requires that we remove it from the AIL if it has already + * been placed there. 
However, the XMI may not yet have been placed in the AIL + * when called by xfs_xmi_release() from XMD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the XMI. + */ +STATIC void +xfs_xmi_release( + struct xfs_xmi_log_item *xmi_lip) +{ + ASSERT(atomic_read(&xmi_lip->xmi_refcount) > 0); + if (atomic_dec_and_test(&xmi_lip->xmi_refcount)) { + xfs_trans_ail_delete(&xmi_lip->xmi_item, 0); + xfs_xmi_item_free(xmi_lip); + } +} + + +STATIC void +xfs_xmi_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_xmi_log_format); +} + +/* + * This is called to fill in the vector of log iovecs for the given xmi log + * item. We use only 1 iovec, and we point that at the xmi_log_format structure + * embedded in the xmi item. + */ +STATIC void +xfs_xmi_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_xmi_log_item *xmi_lip = XMI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + xmi_lip->xmi_format.xmi_type = XFS_LI_XMI; + xmi_lip->xmi_format.xmi_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_XMI_FORMAT, + &xmi_lip->xmi_format, + sizeof(struct xfs_xmi_log_format)); +} + +/* + * The unpin operation is the last place an XMI is manipulated in the log. It + * is either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the XMI transaction has been successfully committed to make it + * this far. Therefore, we expect whoever committed the XMI to either construct + * and commit the XMD or drop the XMD's reference in the event of error. Simply + * drop the log's XMI reference now that the log is done with it. 
+ */ +STATIC void +xfs_xmi_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_xmi_log_item *xmi_lip = XMI_ITEM(lip); + + xfs_xmi_release(xmi_lip); +} + +/* + * The XMI has been either committed or aborted if the transaction has been + * cancelled. If the transaction was cancelled, an XMD isn't going to be + * constructed and thus we free the XMI here directly. + */ +STATIC void +xfs_xmi_item_release( + struct xfs_log_item *lip) +{ + xfs_xmi_release(XMI_ITEM(lip)); +} + +/* Allocate and initialize an xmi item. */ +STATIC struct xfs_xmi_log_item * +xfs_xmi_init( + struct xfs_mount *mp) + +{ + struct xfs_xmi_log_item *xmi_lip; + + xmi_lip = kmem_cache_zalloc(xfs_xmi_cache, GFP_KERNEL | __GFP_NOFAIL); + + xfs_log_item_init(mp, &xmi_lip->xmi_item, XFS_LI_XMI, &xfs_xmi_item_ops); + xmi_lip->xmi_format.xmi_id = (uintptr_t)(void *)xmi_lip; + atomic_set(&xmi_lip->xmi_refcount, 2); + + return xmi_lip; +} + +static inline struct xfs_xmd_log_item *XMD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_xmd_log_item, xmd_item); +} + +STATIC void +xfs_xmd_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_xmd_log_format); +} + +/* + * This is called to fill in the vector of log iovecs for the given xmd log + * item. We use only 1 iovec, and we point that at the xmd_log_format structure + * embedded in the xmd item. + */ +STATIC void +xfs_xmd_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_xmd_log_item *xmd_lip = XMD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + xmd_lip->xmd_format.xmd_type = XFS_LI_XMD; + xmd_lip->xmd_format.xmd_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_XMD_FORMAT, &xmd_lip->xmd_format, + sizeof(struct xfs_xmd_log_format)); +} + +/* + * The XMD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the XMI and free the + * XMD. 
+ */
+STATIC void
+xfs_xmd_item_release(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_xmd_log_item	*xmd_lip = XMD_ITEM(lip);
+
+	/* Drop the XMI reference, then free the shadow buffer and the XMD. */
+	xfs_xmi_release(xmd_lip->xmd_intent_log_item);
+	kvfree(xmd_lip->xmd_item.li_lv_shadow);
+	kmem_cache_free(xfs_xmd_cache, xmd_lip);
+}
+
+/* Return the XMI log item that this XMD log item points back to. */
+static struct xfs_log_item *
+xfs_xmd_item_intent(
+	struct xfs_log_item	*lip)
+{
+	return &XMD_ITEM(lip)->xmd_intent_log_item->xmi_item;
+}
+
+/* Log item operations for the XMD (mapping exchange done) item. */
+static const struct xfs_item_ops xfs_xmd_item_ops = {
+	.flags		= XFS_ITEM_RELEASE_WHEN_COMMITTED |
+			  XFS_ITEM_INTENT_DONE,
+	.iop_size	= xfs_xmd_item_size,
+	.iop_format	= xfs_xmd_item_format,
+	.iop_release	= xfs_xmd_item_release,
+	.iop_intent	= xfs_xmd_item_intent,
+};
+
+/*
+ * Log file mapping exchange information in the intent item.
+ *
+ * Allocates a new XMI log item and copies the inode numbers, generations,
+ * offsets, block count, sizes and logged flags from the first work item on
+ * @items into its format structure.  @sort is not used here.
+ */
+STATIC struct xfs_log_item *
+xfs_exchmaps_create_intent(
+	struct xfs_trans		*tp,
+	struct list_head		*items,
+	unsigned int			count,
+	bool				sort)
+{
+	struct xfs_xmi_log_item		*xmi_lip;
+	struct xfs_exchmaps_intent	*xmi;
+	struct xfs_xmi_log_format	*xlf;
+
+	ASSERT(count == 1);
+
+	/*
+	 * NOTE(review): xmi may be NULL if @items is empty, and it is
+	 * dereferenced below without a check; this presumably relies on the
+	 * caller always passing exactly one item, as asserted above.
+	 */
+	xmi = list_first_entry_or_null(items, struct xfs_exchmaps_intent,
+			xmi_list);
+
+	xmi_lip = xfs_xmi_init(tp->t_mountp);
+	xlf = &xmi_lip->xmi_format;
+
+	xlf->xmi_inode1 = xmi->xmi_ip1->i_ino;
+	xlf->xmi_igen1 = VFS_I(xmi->xmi_ip1)->i_generation;
+	xlf->xmi_inode2 = xmi->xmi_ip2->i_ino;
+	xlf->xmi_igen2 = VFS_I(xmi->xmi_ip2)->i_generation;
+	xlf->xmi_startoff1 = xmi->xmi_startoff1;
+	xlf->xmi_startoff2 = xmi->xmi_startoff2;
+	xlf->xmi_blockcount = xmi->xmi_blockcount;
+	xlf->xmi_isize1 = xmi->xmi_isize1;
+	xlf->xmi_isize2 = xmi->xmi_isize2;
+	/* Only the flags in XFS_EXCHMAPS_LOGGED_FLAGS are written to the log. */
+	xlf->xmi_flags = xmi->xmi_flags & XFS_EXCHMAPS_LOGGED_FLAGS;
+
+	return &xmi_lip->xmi_item;
+}
+
+/*
+ * Allocate an XMD log item for the given XMI.  The XMD records a pointer to
+ * the XMI and copies the XMI's id into its own format structure.  @count is
+ * not used here.
+ */
+STATIC struct xfs_log_item *
+xfs_exchmaps_create_done(
+	struct xfs_trans		*tp,
+	struct xfs_log_item		*intent,
+	unsigned int			count)
+{
+	struct xfs_xmi_log_item		*xmi_lip = XMI_ITEM(intent);
+	struct xfs_xmd_log_item		*xmd_lip;
+
+	xmd_lip = kmem_cache_zalloc(xfs_xmd_cache, GFP_KERNEL | __GFP_NOFAIL);
+	xfs_log_item_init(tp->t_mountp, &xmd_lip->xmd_item, XFS_LI_XMD,
+			  &xfs_xmd_item_ops);
+	xmd_lip->xmd_intent_log_item = xmi_lip;
+	xmd_lip->xmd_format.xmd_xmi_id = xmi_lip->xmi_format.xmi_id;
+
+	return &xmd_lip->xmd_item;
+}
+
+/* Add this deferred XMI to the transaction. */
+void
+xfs_exchmaps_defer_add(
+	struct xfs_trans		*tp,
+	struct xfs_exchmaps_intent	*xmi)
+{
+	trace_xfs_exchmaps_defer(tp->t_mountp, xmi);
+
+	xfs_defer_add(tp, &xmi->xmi_list, &xfs_exchmaps_defer_type);
+}
+
+/* Convert a deferred work list entry back to its exchmaps intent. */
+static inline struct xfs_exchmaps_intent *xmi_entry(const struct list_head *e)
+{
+	return list_entry(e, struct xfs_exchmaps_intent, xmi_list);
+}
+
+/* Cancel a deferred file mapping exchange. */
+STATIC void
+xfs_exchmaps_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_exchmaps_intent	*xmi = xmi_entry(item);
+
+	kmem_cache_free(xfs_exchmaps_intent_cache, xmi);
+}
+
+/*
+ * Process a deferred file mapping exchange.  The work item is freed unless
+ * the exchange returns -EAGAIN.  @done and @state are not used here.
+ */
+STATIC int
+xfs_exchmaps_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_log_item		*done,
+	struct list_head		*item,
+	struct xfs_btree_cur		**state)
+{
+	struct xfs_exchmaps_intent	*xmi = xmi_entry(item);
+	int				error;
+
+	/*
+	 * Exchange one or more mappings between two files.  If there's still
+	 * more work to do, we want to requeue ourselves after all other
+	 * pending deferred operations have finished.  This includes all of
+	 * the dfops that we queued directly as well as any new ones created
+	 * in the process of finishing the others.  Doing so prevents us from
+	 * queuing a large number of XMI log items in kernel memory, which in
+	 * turn prevents us from pinning the tail of the log (while logging
+	 * those new XMI items) until the first XMI items can be processed.
+	 */
+	error = xfs_exchmaps_finish_one(tp, xmi);
+	if (error != -EAGAIN)
+		xfs_exchmaps_cancel_item(item);
+	return error;
+}
+
+/* Abort a pending XMI by dropping a reference to it. */
+STATIC void
+xfs_exchmaps_abort_intent(
+	struct xfs_log_item		*intent)
+{
+	xfs_xmi_release(XMI_ITEM(intent));
+}
+
+/* Is this recovered XMI ok?
+ */
+static inline bool
+xfs_xmi_validate(
+	struct xfs_mount		*mp,
+	struct xfs_xmi_log_item		*xmi_lip)
+{
+	struct xfs_xmi_log_format	*xlf = &xmi_lip->xmi_format;
+
+	/* The filesystem must have the exchange-range feature enabled. */
+	if (!xfs_has_exchange_range(mp))
+		return false;
+
+	/* Padding must be zero. */
+	if (xmi_lip->xmi_format.__pad != 0)
+		return false;
+
+	/* No flags outside the logged set may be present. */
+	if (xlf->xmi_flags & ~XFS_EXCHMAPS_LOGGED_FLAGS)
+		return false;
+
+	if (!xfs_verify_ino(mp, xlf->xmi_inode1) ||
+	    !xfs_verify_ino(mp, xlf->xmi_inode2))
+		return false;
+
+	/* Both file ranges must pass the file extent verifier. */
+	if (!xfs_verify_fileext(mp, xlf->xmi_startoff1, xlf->xmi_blockcount))
+		return false;
+
+	return xfs_verify_fileext(mp, xlf->xmi_startoff2, xlf->xmi_blockcount);
+}
+
+/*
+ * Use the recovered log state to create a new request, estimate resource
+ * requirements, and create a new incore intent state.
+ *
+ * On success, returns the new intent (already added to @dfp), fills out @req,
+ * and passes the two inodes back through @ipp1 and @ipp2.  On failure,
+ * returns an ERR_PTR and releases any inode obtained here.
+ */
+STATIC struct xfs_exchmaps_intent *
+xfs_xmi_item_recover_intent(
+	struct xfs_mount		*mp,
+	struct xfs_defer_pending	*dfp,
+	const struct xfs_xmi_log_format	*xlf,
+	struct xfs_exchmaps_req		*req,
+	struct xfs_inode		**ipp1,
+	struct xfs_inode		**ipp2)
+{
+	struct xfs_inode		*ip1, *ip2;
+	struct xfs_exchmaps_intent	*xmi;
+	int				error;
+
+	/*
+	 * Grab both inodes and set IRECOVERY to prevent trimming of post-eof
+	 * mappings and freeing of unlinked inodes until we're totally done
+	 * processing files.  The ondisk format of this new log item contains
+	 * file handle information, which is why recovery for other items
+	 * does not check the inode generation number.
+	 */
+	error = xlog_recover_iget_handle(mp, xlf->xmi_inode1, xlf->xmi_igen1,
+			&ip1);
+	if (error) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, xlf,
+				sizeof(*xlf));
+		return ERR_PTR(error);
+	}
+
+	error = xlog_recover_iget_handle(mp, xlf->xmi_inode2, xlf->xmi_igen2,
+			&ip2);
+	if (error) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, xlf,
+				sizeof(*xlf));
+		goto err_rele1;
+	}
+
+	req->ip1 = ip1;
+	req->ip2 = ip2;
+	req->startoff1 = xlf->xmi_startoff1;
+	req->startoff2 = xlf->xmi_startoff2;
+	req->blockcount = xlf->xmi_blockcount;
+	req->flags = xlf->xmi_flags & XFS_EXCHMAPS_PARAMS;
+
+	/* Estimate with both inodes locked, but without a transaction. */
+	xfs_exchrange_ilock(NULL, ip1, ip2);
+	error = xfs_exchmaps_estimate(req);
+	xfs_exchrange_iunlock(ip1, ip2);
+	if (error)
+		goto err_rele2;
+
+	*ipp1 = ip1;
+	*ipp2 = ip2;
+	xmi = xfs_exchmaps_init_intent(req);
+	xfs_defer_add_item(dfp, &xmi->xmi_list);
+	return xmi;
+
+err_rele2:
+	xfs_irele(ip2);
+err_rele1:
+	xfs_irele(ip1);
+	/* Don't leave released inode pointers in the caller's request. */
+	req->ip2 = req->ip1 = NULL;
+	return ERR_PTR(error);
+}
+
+/* Process a file mapping exchange item that was recovered from the log.
+ */
+STATIC int
+xfs_exchmaps_recover_work(
+	struct xfs_defer_pending	*dfp,
+	struct list_head		*capture_list)
+{
+	struct xfs_exchmaps_req		req = { .flags = 0 };
+	struct xfs_trans_res		resv;
+	struct xfs_exchmaps_intent	*xmi;
+	struct xfs_log_item		*lip = dfp->dfp_intent;
+	struct xfs_xmi_log_item		*xmi_lip = XMI_ITEM(lip);
+	struct xfs_mount		*mp = lip->li_log->l_mp;
+	struct xfs_trans		*tp;
+	struct xfs_inode		*ip1, *ip2;
+	int				error = 0;
+
+	/* Reject recovered intents that fail validation. */
+	if (!xfs_xmi_validate(mp, xmi_lip)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+				&xmi_lip->xmi_format,
+				sizeof(xmi_lip->xmi_format));
+		return -EFSCORRUPTED;
+	}
+
+	/* Rebuild the incore intent; this also hands back both inodes. */
+	xmi = xfs_xmi_item_recover_intent(mp, dfp, &xmi_lip->xmi_format, &req,
+			&ip1, &ip2);
+	if (IS_ERR(xmi))
+		return PTR_ERR(xmi);
+
+	trace_xfs_exchmaps_recover(mp, xmi);
+
+	resv = xlog_recover_resv(&M_RES(mp)->tr_write);
+	error = xfs_trans_alloc(mp, &resv, req.resblks, 0, 0, &tp);
+	if (error)
+		goto err_rele;
+
+	/* Lock both inodes and join them to the transaction. */
+	xfs_exchrange_ilock(tp, ip1, ip2);
+
+	xfs_exchmaps_ensure_reflink(tp, xmi);
+	xfs_exchmaps_upgrade_extent_counts(tp, xmi);
+	error = xlog_recover_finish_intent(tp, dfp);
+	if (error == -EFSCORRUPTED)
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+				&xmi_lip->xmi_format,
+				sizeof(xmi_lip->xmi_format));
+	if (error)
+		goto err_cancel;
+
+	/*
+	 * Commit transaction, which frees the transaction and saves the inodes
+	 * for later replay activities.
+	 */
+	error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+	/* The transaction is gone; skip the cancel but still unlock and release. */
+	goto err_unlock;
+
+err_cancel:
+	xfs_trans_cancel(tp);
+err_unlock:
+	xfs_exchrange_iunlock(ip1, ip2);
+err_rele:
+	xfs_irele(ip2);
+	xfs_irele(ip1);
+	return error;
+}
+
+/* Relog an intent item to push the log tail forward.
*/ +static struct xfs_log_item * +xfs_exchmaps_relog_intent( + struct xfs_trans *tp, + struct xfs_log_item *intent, + struct xfs_log_item *done_item) +{ + struct xfs_xmi_log_item *xmi_lip; + struct xfs_xmi_log_format *old_xlf, *new_xlf; + + old_xlf = &XMI_ITEM(intent)->xmi_format; + + xmi_lip = xfs_xmi_init(tp->t_mountp); + new_xlf = &xmi_lip->xmi_format; + + new_xlf->xmi_inode1 = old_xlf->xmi_inode1; + new_xlf->xmi_inode2 = old_xlf->xmi_inode2; + new_xlf->xmi_igen1 = old_xlf->xmi_igen1; + new_xlf->xmi_igen2 = old_xlf->xmi_igen2; + new_xlf->xmi_startoff1 = old_xlf->xmi_startoff1; + new_xlf->xmi_startoff2 = old_xlf->xmi_startoff2; + new_xlf->xmi_blockcount = old_xlf->xmi_blockcount; + new_xlf->xmi_flags = old_xlf->xmi_flags; + new_xlf->xmi_isize1 = old_xlf->xmi_isize1; + new_xlf->xmi_isize2 = old_xlf->xmi_isize2; + + return &xmi_lip->xmi_item; +} + +const struct xfs_defer_op_type xfs_exchmaps_defer_type = { + .name = "exchmaps", + .max_items = 1, + .create_intent = xfs_exchmaps_create_intent, + .abort_intent = xfs_exchmaps_abort_intent, + .create_done = xfs_exchmaps_create_done, + .finish_item = xfs_exchmaps_finish_item, + .cancel_item = xfs_exchmaps_cancel_item, + .recover_work = xfs_exchmaps_recover_work, + .relog_intent = xfs_exchmaps_relog_intent, +}; + +STATIC bool +xfs_xmi_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return XMI_ITEM(lip)->xmi_format.xmi_id == intent_id; +} + +static const struct xfs_item_ops xfs_xmi_item_ops = { + .flags = XFS_ITEM_INTENT, + .iop_size = xfs_xmi_item_size, + .iop_format = xfs_xmi_item_format, + .iop_unpin = xfs_xmi_item_unpin, + .iop_release = xfs_xmi_item_release, + .iop_match = xfs_xmi_item_match, +}; + +/* + * This routine is called to create an in-core file mapping exchange item from + * the xmi format structure which was logged on disk. It allocates an in-core + * xmi, copies the exchange information from the format structure into it, and + * adds the xmi to the AIL with the given LSN. 
+ */
+STATIC int
+xlog_recover_xmi_commit_pass2(
+	struct xlog			*log,
+	struct list_head		*buffer_list,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	struct xfs_mount		*mp = log->l_mp;
+	struct xfs_xmi_log_format	*src;
+	struct xfs_xmi_log_item		*new_lip;
+	size_t				fmt_size = sizeof(struct xfs_xmi_log_format);
+
+	/* The first recovered region must be exactly one format structure. */
+	if (item->ri_buf[0].i_len != fmt_size) {
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+		return -EFSCORRUPTED;
+	}
+
+	/* Padding in the logged format must be zero. */
+	src = item->ri_buf[0].i_addr;
+	if (src->__pad != 0) {
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+		return -EFSCORRUPTED;
+	}
+
+	/* Build the incore item from the logged format and hand it to recovery. */
+	new_lip = xfs_xmi_init(mp);
+	memcpy(&new_lip->xmi_format, src, fmt_size);
+	xlog_recover_intent_item(log, &new_lip->xmi_item, lsn,
+			&xfs_exchmaps_defer_type);
+
+	return 0;
+}
+
+/* Log recovery operations for XMI items. */
+const struct xlog_recover_item_ops xlog_xmi_item_ops = {
+	.commit_pass2		= xlog_recover_xmi_commit_pass2,
+	.item_type		= XFS_LI_XMI,
+};
+
+/*
+ * This routine is called when an XMD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding XMI if it
+ * was still in the log. To do this it searches the AIL for the XMI with an id
+ * equal to that in the XMD format structure. If we find it we drop the XMD
+ * reference, which removes the XMI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_xmd_commit_pass2(
+	struct xlog			*log,
+	struct list_head		*buffer_list,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	struct xfs_xmd_log_format	*xmd_formatp;
+
+	/* The pointer is only dereferenced after the length check below. */
+	xmd_formatp = item->ri_buf[0].i_addr;
+	if (item->ri_buf[0].i_len != sizeof(struct xfs_xmd_log_format)) {
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+		return -EFSCORRUPTED;
+	}
+
+	xlog_recover_release_intent(log, XFS_LI_XMI, xmd_formatp->xmd_xmi_id);
+	return 0;
+}
+
+/* Log recovery operations for XMD items. */
+const struct xlog_recover_item_ops xlog_xmd_item_ops = {
+	.item_type		= XFS_LI_XMD,
+	.commit_pass2		= xlog_recover_xmd_commit_pass2,
+};
diff --git a/fs/xfs/xfs_exchmaps_item.h b/fs/xfs/xfs_exchmaps_item.h
new file mode 100644
index 000000000000..efa368d25d09
--- /dev/null
+++ b/fs/xfs/xfs_exchmaps_item.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_EXCHMAPS_ITEM_H__
+#define __XFS_EXCHMAPS_ITEM_H__
+
+/*
+ * The file mapping exchange intent item helps us exchange multiple file
+ * mappings between two inode forks. It does this by tracking the range of
+ * file block offsets that still need to be exchanged, and relogs as progress
+ * happens.
+ *
+ * *I items should be recorded in the *first* of a series of rolled
+ * transactions, and the *D items should be recorded in the same transaction
+ * that records the associated bmbt updates.
+ *
+ * Should the system crash after the commit of the first transaction but
+ * before the commit of the final transaction in a series, log recovery will
+ * use the redo information recorded by the intent items to replay the
+ * rest of the mapping exchanges.
+ */
+
+/* kernel only XMI/XMD definitions */
+
+struct xfs_mount;
+struct kmem_cache;
+
+/*
+ * This is the incore file mapping exchange intent log item. It is used to log
+ * the fact that we are exchanging mappings between two files.
It is used in
+ * conjunction with the incore file mapping exchange done log item described
+ * below.
+ *
+ * These log items follow the same rules as struct xfs_efi_log_item; see the
+ * comments about that structure (in xfs_extfree_item.h) for more details.
+ */
+struct xfs_xmi_log_item {
+	struct xfs_log_item		xmi_item;	/* generic log item */
+	atomic_t			xmi_refcount;	/* reference count */
+	struct xfs_xmi_log_format	xmi_format;	/* logged format */
+};
+
+/*
+ * This is the incore file mapping exchange done log item. It is used to log
+ * the fact that an exchange mentioned in an earlier xmi item has been
+ * performed.
+ */
+struct xfs_xmd_log_item {
+	struct xfs_log_item		xmd_item;	/* generic log item */
+	struct xfs_xmi_log_item		*xmd_intent_log_item;	/* paired XMI */
+	struct xfs_xmd_log_format	xmd_format;	/* logged format */
+};
+
+/* Slab caches from which the XMI and XMD log items are allocated. */
+extern struct kmem_cache	*xfs_xmi_cache;
+extern struct kmem_cache	*xfs_xmd_cache;
+
+struct xfs_exchmaps_intent;
+
+void xfs_exchmaps_defer_add(struct xfs_trans *tp,
+		struct xfs_exchmaps_intent *xmi);
+
+#endif /* __XFS_EXCHMAPS_ITEM_H__ */
diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
new file mode 100644
index 000000000000..c8a655c92c92
--- /dev/null
+++ b/fs/xfs/xfs_exchrange.c
@@ -0,0 +1,804 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_quota.h"
+#include "xfs_bmap_util.h"
+#include "xfs_reflink.h"
+#include "xfs_trace.h"
+#include "xfs_exchrange.h"
+#include "xfs_exchmaps.h"
+#include "xfs_sb.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
+#include "xfs_rtbitmap.h"
+#include <linux/fsnotify.h>
+
+/* Lock (and optionally join) two inodes for a file range exchange.
+ */
+void
+xfs_exchrange_ilock(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip1,
+	struct xfs_inode	*ip2)
+{
+	/* Take ILOCK_EXCL on both inodes, or once if they are the same inode. */
+	if (ip1 != ip2)
+		xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
+				    ip2, XFS_ILOCK_EXCL);
+	else
+		xfs_ilock(ip1, XFS_ILOCK_EXCL);
+	/* Joining is optional; a NULL @tp means lock only. */
+	if (tp) {
+		xfs_trans_ijoin(tp, ip1, 0);
+		if (ip2 != ip1)
+			xfs_trans_ijoin(tp, ip2, 0);
+	}
+
+}
+
+/* Unlock two inodes after a file range exchange operation. */
+void
+xfs_exchrange_iunlock(
+	struct xfs_inode	*ip1,
+	struct xfs_inode	*ip2)
+{
+	if (ip2 != ip1)
+		xfs_iunlock(ip2, XFS_ILOCK_EXCL);
+	xfs_iunlock(ip1, XFS_ILOCK_EXCL);
+}
+
+/*
+ * Estimate the resource requirements to exchange file contents between the two
+ * files.  The caller is required to hold the IOLOCK and the MMAPLOCK and to
+ * have flushed both inodes' pagecache and active direct-ios.  The ILOCK is
+ * taken and dropped here around the estimate.
+ */
+int
+xfs_exchrange_estimate(
+	struct xfs_exchmaps_req	*req)
+{
+	int			error;
+
+	xfs_exchrange_ilock(NULL, req->ip1, req->ip2);
+	error = xfs_exchmaps_estimate(req);
+	xfs_exchrange_iunlock(req->ip1, req->ip2);
+	return error;
+}
+
+/* Bits set in *qretry to say which inode's quota reservation failed. */
+#define QRETRY_IP1	(0x1)
+#define QRETRY_IP2	(0x2)
+
+/*
+ * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
+ * this if quota enforcement is disabled or if both inodes' dquots are the
+ * same.  The qretry structure must be initialized to zeroes before the first
+ * call to this function.
+ *
+ * NOTE(review): *qretry is cleared below, but only after the early-return
+ * check; on the early return it is left untouched.
+ */
+STATIC int
+xfs_exchrange_reserve_quota(
+	struct xfs_trans		*tp,
+	const struct xfs_exchmaps_req	*req,
+	unsigned int			*qretry)
+{
+	int64_t				ddelta, rdelta;
+	int				ip1_error = 0;
+	int				error;
+
+	/*
+	 * Don't bother with a quota reservation if we're not enforcing them
+	 * or the two inodes have the same dquots.
+	 */
+	if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
+	    (req->ip1->i_udquot == req->ip2->i_udquot &&
+	     req->ip1->i_gdquot == req->ip2->i_gdquot &&
+	     req->ip1->i_pdquot == req->ip2->i_pdquot))
+		return 0;
+
+	*qretry = 0;
+
+	/*
+	 * For each file, compute the net gain in the number of regular blocks
+	 * that will be mapped into that file and reserve that much quota.  The
+	 * quota counts must be able to absorb at least that much space.
+	 */
+	ddelta = req->ip2_bcount - req->ip1_bcount;
+	rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
+	if (ddelta > 0 || rdelta > 0) {
+		error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
+				ddelta > 0 ? ddelta : 0,
+				rdelta > 0 ? rdelta : 0,
+				false);
+		if (error == -EDQUOT || error == -ENOSPC) {
+			/*
+			 * Save this error and see what happens if we try to
+			 * reserve quota for ip2.  Then report both.
+			 */
+			*qretry |= QRETRY_IP1;
+			ip1_error = error;
+			error = 0;
+		}
+		if (error)
+			return error;
+	}
+	if (ddelta < 0 || rdelta < 0) {
+		error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
+				ddelta < 0 ? -ddelta : 0,
+				rdelta < 0 ? -rdelta : 0,
+				false);
+		if (error == -EDQUOT || error == -ENOSPC)
+			*qretry |= QRETRY_IP2;
+		if (error)
+			return error;
+	}
+	/* Report the deferred ip1 failure now that ip2 has been tried. */
+	if (ip1_error)
+		return ip1_error;
+
+	/*
+	 * For each file, forcibly reserve the gross gain in mapped blocks so
+	 * that we don't trip over any quota block reservation assertions.
+	 * We must reserve the gross gain because the quota code subtracts from
+	 * bcount the number of blocks that we unmap; it does not add that
+	 * quantity back to the quota block reservation.
+	 */
+	error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
+			req->ip1_rtbcount, true);
+	if (error)
+		return error;
+
+	return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
+			req->ip2_rtbcount, true);
+}
+
+/* Exchange the mappings (and hence the contents) of two files' forks.
+ */
+STATIC int
+xfs_exchrange_mappings(
+	const struct xfs_exchrange	*fxr,
+	struct xfs_inode		*ip1,
+	struct xfs_inode		*ip2)
+{
+	struct xfs_mount		*mp = ip1->i_mount;
+	struct xfs_exchmaps_req		req = {
+		.ip1			= ip1,
+		.ip2			= ip2,
+		.startoff1		= XFS_B_TO_FSBT(mp, fxr->file1_offset),
+		.startoff2		= XFS_B_TO_FSBT(mp, fxr->file2_offset),
+		.blockcount		= XFS_B_TO_FSB(mp, fxr->length),
+	};
+	struct xfs_trans		*tp;
+	unsigned int			qretry;
+	bool				retried = false;
+	int				error;
+
+	trace_xfs_exchrange_mappings(fxr, ip1, ip2);
+
+	/* Translate the request flags into mapping exchange flags. */
+	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
+		req.flags |= XFS_EXCHMAPS_SET_SIZES;
+	if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
+		req.flags |= XFS_EXCHMAPS_INO1_WRITTEN;
+
+	/*
+	 * Round the request length up to the nearest file allocation unit.
+	 * The prep function already checked that the request offsets and
+	 * length in @fxr are safe to round up.
+	 */
+	if (xfs_inode_has_bigrtalloc(ip2))
+		req.blockcount = xfs_rtb_roundup_rtx(mp, req.blockcount);
+
+	error = xfs_exchrange_estimate(&req);
+	if (error)
+		return error;
+
+retry:
+	/* Allocate the transaction, lock the inodes, and join them. */
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
+			XFS_TRANS_RES_FDBLKS, &tp);
+	if (error)
+		return error;
+
+	xfs_exchrange_ilock(tp, ip1, ip2);
+
+	trace_xfs_exchrange_before(ip2, 2);
+	trace_xfs_exchrange_before(ip1, 1);
+
+	error = xfs_exchmaps_check_forks(mp, &req);
+	if (error)
+		goto out_trans_cancel;
+
+	/*
+	 * Reserve ourselves some quota if any of them are in enforcing mode.
+	 * In theory we only need enough to satisfy the change in the number
+	 * of blocks between the two ranges being remapped.
+	 */
+	error = xfs_exchrange_reserve_quota(tp, &req, &qretry);
+	if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+		/*
+		 * Retry once after asking blockgc to free space for whichever
+		 * inode(s) failed the reservation.
+		 */
+		xfs_trans_cancel(tp);
+		xfs_exchrange_iunlock(ip1, ip2);
+		if (qretry & QRETRY_IP1)
+			xfs_blockgc_free_quota(ip1, 0);
+		if (qretry & QRETRY_IP2)
+			xfs_blockgc_free_quota(ip2, 0);
+		retried = true;
+		goto retry;
+	}
+	if (error)
+		goto out_trans_cancel;
+
+	/*
+	 * If we got this far on a dry run, all parameters are ok.  error is
+	 * zero here, so the transaction is cancelled and success is returned.
+	 */
+	if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN)
+		goto out_trans_cancel;
+
+	/* Update the mtime and ctime of both files. */
+	if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1)
+		xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
+		xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	xfs_exchange_mappings(tp, &req);
+
+	/*
+	 * Force the log to persist metadata updates if the caller or the
+	 * administrator requires this.  The generic prep function already
+	 * flushed the relevant parts of the page cache.
+	 */
+	if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC))
+		xfs_trans_set_sync(tp);
+
+	error = xfs_trans_commit(tp);
+
+	trace_xfs_exchrange_after(ip2, 2);
+	trace_xfs_exchrange_after(ip1, 1);
+
+	if (error)
+		goto out_unlock;
+
+	/*
+	 * If the caller wanted us to exchange the contents of two complete
+	 * files of unequal length, exchange the incore sizes now.  This should
+	 * be safe because we flushed both files' page caches, exchanged all
+	 * the mappings, and updated the ondisk sizes.
+	 */
+	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
+		loff_t	temp;
+
+		temp = i_size_read(VFS_I(ip2));
+		i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
+		i_size_write(VFS_I(ip1), temp);
+	}
+
+out_unlock:
+	xfs_exchrange_iunlock(ip1, ip2);
+	return error;
+
+out_trans_cancel:
+	xfs_trans_cancel(tp);
+	goto out_unlock;
+}
+
+/*
+ * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
+ * This part deals with struct file objects and byte ranges and does not deal
+ * with XFS-specific data structures such as xfs_inodes and block ranges. This
+ * separation may some day facilitate porting to another filesystem.
+ *
+ * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
+ * file1 with the same number of bytes starting at fxr.file2_offset in file2.
+ * Implementations must call xfs_exchange_range_prep to prepare the two
+ * files prior to taking locks; and they must update the inode change and mod
+ * times of both files as part of the metadata update. The timestamp update
+ * and freshness checks must be done atomically as part of the data exchange
+ * operation to ensure correctness of the freshness check.
+ * xfs_exchange_range_finish must be called after the operation completes
+ * successfully but before locks are dropped.
+ */
+
+/* Verify that we have security clearance to perform this operation. */
+static int
+xfs_exchange_range_verify_area(
+	struct xfs_exchrange	*fxr)
+{
+	int			ret;
+
+	ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
+			true);
+	if (ret)
+		return ret;
+
+	return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
+			true);
+}
+
+/*
+ * Performs necessary checks before doing a range exchange, having stabilized
+ * mutable inode attributes via i_rwsem.
+ *
+ * May rewrite fxr->length when XFS_EXCHANGE_RANGE_TO_EOF is set.  The mask
+ * arithmetic on @alloc_unit assumes a power of two; xfs_exchrange_prep falls
+ * back to the block size when the allocation unit is not one.
+ */
+static inline int
+xfs_exchange_range_checks(
+	struct xfs_exchrange	*fxr,
+	unsigned int		alloc_unit)
+{
+	struct inode		*inode1 = file_inode(fxr->file1);
+	struct inode		*inode2 = file_inode(fxr->file2);
+	uint64_t		allocmask = alloc_unit - 1;
+	int64_t			test_len;
+	uint64_t		blen;
+	loff_t			size1, size2, tmp;
+	int			error;
+
+	/* Don't touch certain kinds of inodes */
+	if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
+		return -EPERM;
+	if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
+		return -ETXTBSY;
+
+	size1 = i_size_read(inode1);
+	size2 = i_size_read(inode2);
+
+	/* Ranges cannot start after EOF. */
+	if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
+		return -EINVAL;
+
+	/*
+	 * If the caller said to exchange to EOF, we set the length of the
+	 * request large enough to cover everything to the end of both files.
+	 */
+	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
+		fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
+					     size2 - fxr->file2_offset);
+
+		error = xfs_exchange_range_verify_area(fxr);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * The start of both ranges must be aligned to the file allocation
+	 * unit.
+	 */
+	if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
+	    !IS_ALIGNED(fxr->file2_offset, alloc_unit))
+		return -EINVAL;
+
+	/* Ensure offsets don't wrap. */
+	if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
+	    check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
+		return -EINVAL;
+
+	/*
+	 * We require both ranges to end within EOF, unless we're exchanging
+	 * to EOF.
+	 */
+	if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) &&
+	    (fxr->file1_offset + fxr->length > size1 ||
+	     fxr->file2_offset + fxr->length > size2))
+		return -EINVAL;
+
+	/*
+	 * Make sure we don't hit any file size limits.  If we hit any size
+	 * limits such that test_len was adjusted, we abort the whole
+	 * operation.
+	 */
+	test_len = fxr->length;
+	error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
+			&test_len);
+	if (error)
+		return error;
+	error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
+			&test_len);
+	if (error)
+		return error;
+	if (test_len != fxr->length)
+		return -EINVAL;
+
+	/*
+	 * If the user wanted us to exchange up to the infile's EOF, round up
+	 * to the next allocation unit boundary for this check.  Do the same
+	 * for the outfile.
+	 *
+	 * Otherwise, reject the range length if it's not aligned to an
+	 * allocation unit.
+	 */
+	if (fxr->file1_offset + fxr->length == size1)
+		blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
+	else if (fxr->file2_offset + fxr->length == size2)
+		blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
+	else if (!IS_ALIGNED(fxr->length, alloc_unit))
+		return -EINVAL;
+	else
+		blen = fxr->length;
+
+	/* Don't allow overlapped exchanges within the same file. */
+	if (inode1 == inode2 &&
+	    fxr->file2_offset + blen > fxr->file1_offset &&
+	    fxr->file1_offset + blen > fxr->file2_offset)
+		return -EINVAL;
+
+	/*
+	 * Ensure that we don't exchange a partial EOF block into the middle of
+	 * another file.
+	 */
+	if ((fxr->length & allocmask) == 0)
+		return 0;
+
+	/*
+	 * The length is unaligned.  Round it down for any range that ends
+	 * before EOF; if that changes the length, a partial block would land
+	 * mid-file, so reject the request.
+	 */
+	blen = fxr->length;
+	if (fxr->file2_offset + blen < size2)
+		blen &= ~allocmask;
+
+	if (fxr->file1_offset + blen < size1)
+		blen &= ~allocmask;
+
+	return blen == fxr->length ? 0 : -EINVAL;
+}
+
+/*
+ * Check that the two inodes are eligible for range exchanges, the ranges make
+ * sense, and then flush all dirty data.  Caller must ensure that the inodes
+ * have been locked against any other modifications.
+ */
+static inline int
+xfs_exchange_range_prep(
+	struct xfs_exchrange	*fxr,
+	unsigned int		alloc_unit)
+{
+	struct inode		*inode1 = file_inode(fxr->file1);
+	struct inode		*inode2 = file_inode(fxr->file2);
+	bool			same_inode = (inode1 == inode2);
+	int			error;
+
+	/* Check that we don't violate system file offset limits. */
+	error = xfs_exchange_range_checks(fxr, alloc_unit);
+	/* A zero-length request needs no flushing; return success early. */
+	if (error || fxr->length == 0)
+		return error;
+
+	/* Wait for the completion of any pending IOs on both files */
+	inode_dio_wait(inode1);
+	if (!same_inode)
+		inode_dio_wait(inode2);
+
+	error = filemap_write_and_wait_range(inode1->i_mapping,
+			fxr->file1_offset,
+			fxr->file1_offset + fxr->length - 1);
+	if (error)
+		return error;
+
+	error = filemap_write_and_wait_range(inode2->i_mapping,
+			fxr->file2_offset,
+			fxr->file2_offset + fxr->length - 1);
+	if (error)
+		return error;
+
+	/*
+	 * If the files or inodes involved require synchronous writes, amend
+	 * the request to force the filesystem to flush all data and metadata
+	 * to disk after the operation completes.
+	 */
+	if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) ||
+	    IS_SYNC(inode1) || IS_SYNC(inode2))
+		fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;
+
+	return 0;
+}
+
+/*
+ * Finish a range exchange operation, if it was successful.  Caller must ensure
+ * that the inodes are still locked against any other modifications.
+ */
+static inline int
+xfs_exchange_range_finish(
+	struct xfs_exchrange	*fxr)
+{
+	int			error;
+
+	error = file_remove_privs(fxr->file1);
+	if (error)
+		return error;
+	/* Same inode: privileges were already removed through file1. */
+	if (file_inode(fxr->file1) == file_inode(fxr->file2))
+		return 0;
+
+	return file_remove_privs(fxr->file2);
+}
+
+/*
+ * Check the alignment of an exchange request when the allocation unit size
+ * isn't a power of two.  The generic file-level helpers use (fast)
+ * bitmask-based alignment checks, but here we have to use slow long division.
+ */
+static int
+xfs_exchrange_check_rtalign(
+	const struct xfs_exchrange	*fxr,
+	struct xfs_inode		*ip1,
+	struct xfs_inode		*ip2,
+	unsigned int			alloc_unit)
+{
+	uint64_t			length = fxr->length;
+	uint64_t			blen;
+	loff_t				size1, size2;
+
+	size1 = i_size_read(VFS_I(ip1));
+	size2 = i_size_read(VFS_I(ip2));
+
+	/* The start of both ranges must be aligned to a rt extent. */
+	if (!isaligned_64(fxr->file1_offset, alloc_unit) ||
+	    !isaligned_64(fxr->file2_offset, alloc_unit))
+		return -EINVAL;
+
+	/* Work on a local length; @fxr is const and is not modified here. */
+	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
+		length = max_t(int64_t, size1 - fxr->file1_offset,
+					size2 - fxr->file2_offset);
+
+	/*
+	 * If the user wanted us to exchange up to the infile's EOF, round up
+	 * to the next rt extent boundary for this check.  Do the same for the
+	 * outfile.
+	 *
+	 * Otherwise, reject the range length if it's not rt extent aligned.
+	 * We already confirmed the starting offsets' rt extent block
+	 * alignment.
+	 */
+	if (fxr->file1_offset + length == size1)
+		blen = roundup_64(size1, alloc_unit) - fxr->file1_offset;
+	else if (fxr->file2_offset + length == size2)
+		blen = roundup_64(size2, alloc_unit) - fxr->file2_offset;
+	else if (!isaligned_64(length, alloc_unit))
+		return -EINVAL;
+	else
+		blen = length;
+
+	/* Don't allow overlapped exchanges within the same file. */
+	if (ip1 == ip2 &&
+	    fxr->file2_offset + blen > fxr->file1_offset &&
+	    fxr->file1_offset + blen > fxr->file2_offset)
+		return -EINVAL;
+
+	/*
+	 * Ensure that we don't exchange a partial EOF rt extent into the
+	 * middle of another file.
+	 */
+	if (isaligned_64(length, alloc_unit))
+		return 0;
+
+	blen = length;
+	if (fxr->file2_offset + length < size2)
+		blen = rounddown_64(blen, alloc_unit);
+
+	if (fxr->file1_offset + blen < size1)
+		blen = rounddown_64(blen, alloc_unit);
+
+	return blen == length ? 0 : -EINVAL;
+}
+
+/* Prepare two files to have their data exchanged. */
+STATIC int
+xfs_exchrange_prep(
+	struct xfs_exchrange	*fxr,
+	struct xfs_inode	*ip1,
+	struct xfs_inode	*ip2)
+{
+	struct xfs_mount	*mp = ip2->i_mount;
+	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip2);
+	int			error;
+
+	trace_xfs_exchrange_prep(fxr, ip1, ip2);
+
+	/* Verify both files are either real-time or non-realtime */
+	if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
+		return -EINVAL;
+
+	/* Check non-power of two alignment issues, if necessary. */
+	if (!is_power_of_2(alloc_unit)) {
+		error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit);
+		if (error)
+			return error;
+
+		/*
+		 * Do the generic file-level checks with the regular block
+		 * alignment.
+		 */
+		alloc_unit = mp->m_sb.sb_blocksize;
+	}
+
+	error = xfs_exchange_range_prep(fxr, alloc_unit);
+	/* Nothing more to prepare for a zero-length request. */
+	if (error || fxr->length == 0)
+		return error;
+
+	/* Attach dquots to both inodes before changing block maps. */
+	error = xfs_qm_dqattach(ip2);
+	if (error)
+		return error;
+	error = xfs_qm_dqattach(ip1);
+	if (error)
+		return error;
+
+	trace_xfs_exchrange_flush(fxr, ip1, ip2);
+
+	/* Flush the relevant ranges of both files. */
+	error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
+	if (error)
+		return error;
+	error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
+	if (error)
+		return error;
+
+	/*
+	 * Cancel CoW fork preallocations for the ranges of both files.  The
+	 * prep function should have flushed all the dirty data, so the only
+	 * CoW mappings remaining should be speculative.
+	 */
+	if (xfs_inode_has_cow_data(ip1)) {
+		error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
+				fxr->length, true);
+		if (error)
+			return error;
+	}
+
+	if (xfs_inode_has_cow_data(ip2)) {
+		error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
+				fxr->length, true);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
+/*
+ * Exchange contents of files.  This is the binding between the generic
+ * file-level concepts and the XFS inode-specific implementation.
+ */
+STATIC int
+xfs_exchrange_contents(
+	struct xfs_exchrange	*fxr)
+{
+	struct inode		*inode1 = file_inode(fxr->file1);
+	struct inode		*inode2 = file_inode(fxr->file2);
+	struct xfs_inode	*ip1 = XFS_I(inode1);
+	struct xfs_inode	*ip2 = XFS_I(inode2);
+	struct xfs_mount	*mp = ip1->i_mount;
+	int			error;
+
+	if (!xfs_has_exchange_range(mp))
+		return -EOPNOTSUPP;
+
+	/* Private flags are allowed here, unlike in xfs_exchange_range(). */
+	if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
+			   XFS_EXCHANGE_RANGE_PRIV_FLAGS))
+		return -EINVAL;
+
+	if (xfs_is_shutdown(mp))
+		return -EIO;
+
+	/* Lock both files against IO */
+	error = xfs_ilock2_io_mmap(ip1, ip2);
+	if (error)
+		goto out_err;
+
+	/* Prepare and then exchange file contents. */
+	error = xfs_exchrange_prep(fxr, ip1, ip2);
+	if (error)
+		goto out_unlock;
+
+	error = xfs_exchrange_mappings(fxr, ip1, ip2);
+	if (error)
+		goto out_unlock;
+
+	/*
+	 * Finish the exchange by removing special file privileges like any
+	 * other file write would do.  This may involve turning on support for
+	 * logged xattrs if either file has security capabilities.
+	 */
+	error = xfs_exchange_range_finish(fxr);
+	if (error)
+		goto out_unlock;
+
+out_unlock:
+	xfs_iunlock2_io_mmap(ip1, ip2);
+out_err:
+	/* Any failure is traced against ip2. */
+	if (error)
+		trace_xfs_exchrange_error(ip2, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Exchange parts of two files.  Validates the two struct files, sets the
+ * private cmtime-update flags, performs the exchange under write access to
+ * file2, and sends fsnotify modify events on success.
+ */
+static int
+xfs_exchange_range(
+	struct xfs_exchrange	*fxr)
+{
+	struct inode		*inode1 = file_inode(fxr->file1);
+	struct inode		*inode2 = file_inode(fxr->file2);
+	int			ret;
+
+	/* Public and private flag bits must never overlap. */
+	BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
+		     XFS_EXCHANGE_RANGE_PRIV_FLAGS);
+
+	/* Both files must be on the same mount/filesystem. */
+	if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
+		return -EXDEV;
+
+	if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
+		return -EINVAL;
+
+	/* Userspace requests only honored for regular files. */
+	if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
+		return -EISDIR;
+	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
+		return -EINVAL;
+
+	/* Both files must be opened for read and write. */
+	if (!(fxr->file1->f_mode & FMODE_READ) ||
+	    !(fxr->file1->f_mode & FMODE_WRITE) ||
+	    !(fxr->file2->f_mode & FMODE_READ) ||
+	    !(fxr->file2->f_mode & FMODE_WRITE))
+		return -EBADF;
+
+	/* Neither file can be opened append-only. */
+	if ((fxr->file1->f_flags & O_APPEND) ||
+	    (fxr->file2->f_flags & O_APPEND))
+		return -EBADF;
+
+	/*
+	 * If we're not exchanging to EOF, we can check the areas before
+	 * stabilizing both files' i_size.
+	 */
+	if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) {
+		ret = xfs_exchange_range_verify_area(fxr);
+		if (ret)
+			return ret;
+	}
+
+	/* Update cmtime if the fd/inode don't forbid it. */
+	if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
+		fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
+	if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
+		fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;
+
+	/* Both files were checked above to be on the same mount. */
+	file_start_write(fxr->file2);
+	ret = xfs_exchrange_contents(fxr);
+	file_end_write(fxr->file2);
+	if (ret)
+		return ret;
+
+	fsnotify_modify(fxr->file1);
+	if (fxr->file2 != fxr->file1)
+		fsnotify_modify(fxr->file2);
+	return 0;
+}
+
+/* Collect exchange-range arguments from userspace.
*/ +long +xfs_ioc_exchange_range( + struct file *file, + struct xfs_exchange_range __user *argp) +{ + struct xfs_exchrange fxr = { + .file2 = file, + }; + struct xfs_exchange_range args; + struct fd file1; + int error; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + if (memchr_inv(&args.pad, 0, sizeof(args.pad))) + return -EINVAL; + if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS) + return -EINVAL; + + fxr.file1_offset = args.file1_offset; + fxr.file2_offset = args.file2_offset; + fxr.length = args.length; + fxr.flags = args.flags; + + file1 = fdget(args.file1_fd); + if (!file1.file) + return -EBADF; + fxr.file1 = file1.file; + + error = xfs_exchange_range(&fxr); + fdput(file1); + return error; +} diff --git a/fs/xfs/xfs_exchrange.h b/fs/xfs/xfs_exchrange.h new file mode 100644 index 000000000000..039abcca546e --- /dev/null +++ b/fs/xfs/xfs_exchrange.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_EXCHRANGE_H__ +#define __XFS_EXCHRANGE_H__ + +/* Update the mtime/cmtime of file1 and file2 */ +#define __XFS_EXCHANGE_RANGE_UPD_CMTIME1 (1ULL << 63) +#define __XFS_EXCHANGE_RANGE_UPD_CMTIME2 (1ULL << 62) + +#define XFS_EXCHANGE_RANGE_PRIV_FLAGS (__XFS_EXCHANGE_RANGE_UPD_CMTIME1 | \ + __XFS_EXCHANGE_RANGE_UPD_CMTIME2) + +struct xfs_exchrange { + struct file *file1; + struct file *file2; + + loff_t file1_offset; + loff_t file2_offset; + u64 length; + + u64 flags; /* XFS_EXCHANGE_RANGE flags */ +}; + +long xfs_ioc_exchange_range(struct file *file, + struct xfs_exchange_range __user *argp); + +struct xfs_exchmaps_req; + +void xfs_exchrange_ilock(struct xfs_trans *tp, struct xfs_inode *ip1, + struct xfs_inode *ip2); +void xfs_exchrange_iunlock(struct xfs_inode *ip1, struct xfs_inode *ip2); + +int xfs_exchrange_estimate(struct xfs_exchmaps_req *req); + +#endif /* __XFS_EXCHRANGE_H__ */ diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 7cd09c3a82cb..201489d3de08 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -102,7 +102,7 @@ xfs_fs_encode_fh( return fileid_type; } -STATIC struct inode * +struct inode * xfs_nfs_get_inode( struct super_block *sb, u64 ino, @@ -160,7 +160,7 @@ xfs_nfs_get_inode( } } - if (VFS_I(ip)->i_generation != generation) { + if (VFS_I(ip)->i_generation != generation || IS_PRIVATE(VFS_I(ip))) { xfs_irele(ip); return ERR_PTR(-ESTALE); } diff --git a/fs/xfs/xfs_export.h b/fs/xfs/xfs_export.h index 64471a3ddb04..3cd85e8901a5 100644 --- a/fs/xfs/xfs_export.h +++ b/fs/xfs/xfs_export.h @@ -57,4 +57,6 @@ struct xfs_fid64 { /* This flag goes on the wire. Don't play with it. 
*/ #define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */ +struct inode *xfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 gen); + #endif /* __XFS_EXPORT_H__ */ diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 56cfa1498571..a73e7c73b664 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -518,35 +518,26 @@ fail: goto out; } -STATIC void +static bool xfs_extent_busy_clear_one( - struct xfs_mount *mp, struct xfs_perag *pag, - struct xfs_extent_busy *busyp) + struct xfs_extent_busy *busyp, + bool do_discard) { if (busyp->length) { - trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno, - busyp->length); + if (do_discard && + !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) { + busyp->flags = XFS_EXTENT_BUSY_DISCARDED; + return false; + } + trace_xfs_extent_busy_clear(pag->pag_mount, busyp->agno, + busyp->bno, busyp->length); rb_erase(&busyp->rb_node, &pag->pagb_tree); } list_del_init(&busyp->list); kfree(busyp); -} - -static void -xfs_extent_busy_put_pag( - struct xfs_perag *pag, - bool wakeup) - __releases(pag->pagb_lock) -{ - if (wakeup) { - pag->pagb_gen++; - wake_up_all(&pag->pagb_wait); - } - - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); + return true; } /* @@ -560,32 +551,33 @@ xfs_extent_busy_clear( struct list_head *list, bool do_discard) { - struct xfs_extent_busy *busyp, *n; - struct xfs_perag *pag = NULL; - xfs_agnumber_t agno = NULLAGNUMBER; - bool wakeup = false; - - list_for_each_entry_safe(busyp, n, list, list) { - if (busyp->agno != agno) { - if (pag) - xfs_extent_busy_put_pag(pag, wakeup); - agno = busyp->agno; - pag = xfs_perag_get(mp, agno); - spin_lock(&pag->pagb_lock); - wakeup = false; - } + struct xfs_extent_busy *busyp, *next; - if (do_discard && busyp->length && - !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) { - busyp->flags = XFS_EXTENT_BUSY_DISCARDED; - } else { - xfs_extent_busy_clear_one(mp, pag, busyp); - wakeup = true; - } - } + busyp = 
list_first_entry_or_null(list, typeof(*busyp), list); + if (!busyp) + return; - if (pag) - xfs_extent_busy_put_pag(pag, wakeup); + do { + bool wakeup = false; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + do { + next = list_next_entry(busyp, list); + if (xfs_extent_busy_clear_one(pag, busyp, do_discard)) + wakeup = true; + busyp = next; + } while (!list_entry_is_head(busyp, list, list) && + busyp->agno == pag->pag_agno); + + if (wakeup) { + pag->pagb_gen++; + wake_up_all(&pag->pagb_wait); + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } while (!list_entry_is_head(busyp, list, list)); } /* diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2ce302b4885f..b240ea5241dc 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -24,6 +24,7 @@ #include "xfs_pnfs.h" #include "xfs_iomap.h" #include "xfs_reflink.h" +#include "xfs_file.h" #include <linux/dax.h> #include <linux/falloc.h> @@ -38,33 +39,19 @@ static const struct vm_operations_struct xfs_file_vm_ops; * Decide if the given file range is aligned to the size of the fundamental * allocation unit for the file. 
*/ -static bool +bool xfs_is_falloc_aligned( struct xfs_inode *ip, loff_t pos, long long int len) { - struct xfs_mount *mp = ip->i_mount; - uint64_t mask; - - if (XFS_IS_REALTIME_INODE(ip)) { - if (!is_power_of_2(mp->m_sb.sb_rextsize)) { - u64 rextbytes; - u32 mod; - - rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); - div_u64_rem(pos, rextbytes, &mod); - if (mod) - return false; - div_u64_rem(len, rextbytes, &mod); - return mod == 0; - } - mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1; - } else { - mask = mp->m_sb.sb_blocksize - 1; - } + unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip); + + if (!is_power_of_2(alloc_unit)) + return isaligned_64(pos, alloc_unit) && + isaligned_64(len, alloc_unit); - return !((pos | len) & mask); + return !((pos | len) & (alloc_unit - 1)); } /* @@ -861,67 +848,6 @@ xfs_file_write_iter( return xfs_file_buffered_write(iocb, from); } -static void -xfs_wait_dax_page( - struct inode *inode) -{ - struct xfs_inode *ip = XFS_I(inode); - - xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); - schedule(); - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); -} - -int -xfs_break_dax_layouts( - struct inode *inode, - bool *retry) -{ - struct page *page; - - xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL); - - page = dax_layout_busy_page(inode->i_mapping); - if (!page) - return 0; - - *retry = true; - return ___wait_var_event(&page->_refcount, - atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, - 0, 0, xfs_wait_dax_page(inode)); -} - -int -xfs_break_layouts( - struct inode *inode, - uint *iolock, - enum layout_break_reason reason) -{ - bool retry; - int error; - - xfs_assert_ilocked(XFS_I(inode), XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL); - - do { - retry = false; - switch (reason) { - case BREAK_UNMAP: - error = xfs_break_dax_layouts(inode, &retry); - if (error || retry) - break; - fallthrough; - case BREAK_WRITE: - error = xfs_break_leased_layouts(inode, iolock, &retry); - break; - default: - WARN_ON_ONCE(1); - error = -EINVAL; - } - } while (error == 0 && 
retry); - - return error; -} - /* Does this file, inode, or mount want synchronous writes? */ static inline bool xfs_file_sync_writes(struct file *filp) { diff --git a/fs/xfs/xfs_file.h b/fs/xfs/xfs_file.h new file mode 100644 index 000000000000..2ad91f755caf --- /dev/null +++ b/fs/xfs/xfs_file.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_FILE_H__ +#define __XFS_FILE_H__ + +extern const struct file_operations xfs_file_operations; +extern const struct file_operations xfs_dir_file_operations; + +bool xfs_is_falloc_aligned(struct xfs_inode *ip, loff_t pos, + long long int len); + +#endif /* __XFS_FILE_H__ */ diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index de59eec74765..85dbb46452ca 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -533,7 +533,7 @@ xfs_getfsmap_rtdev_rtbitmap( trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb); trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb); - xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP); /* * Set up query parameters to return free rtextents covering the range @@ -557,7 +557,7 @@ xfs_getfsmap_rtdev_rtbitmap( if (error) goto err; err: - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); return error; } #endif /* CONFIG_XFS_RT */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 83f708f62ed9..c211ea2b63c4 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -213,10 +213,8 @@ xfs_growfs_data_private( struct xfs_perag *pag; pag = xfs_perag_get(mp, id.agno); - error = xfs_ag_resv_free(pag); + xfs_ag_resv_free(pag); xfs_perag_put(pag); - if (error) - return error; } /* * Reserve AG metadata blocks. 
ENOSPC here does not mean there @@ -385,14 +383,14 @@ xfs_reserve_blocks( */ if (mp->m_resblks > request) { lcounter = mp->m_resblks_avail - request; - if (lcounter > 0) { /* release unused blocks */ + if (lcounter > 0) { /* release unused blocks */ fdblks_delta = lcounter; mp->m_resblks_avail -= lcounter; } mp->m_resblks = request; if (fdblks_delta) { spin_unlock(&mp->m_sb_lock); - error = xfs_mod_fdblocks(mp, fdblks_delta, 0); + xfs_add_fdblocks(mp, fdblks_delta); spin_lock(&mp->m_sb_lock); } @@ -428,9 +426,9 @@ xfs_reserve_blocks( */ fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); - error = xfs_mod_fdblocks(mp, -fdblks_delta, 0); + error = xfs_dec_fdblocks(mp, fdblks_delta, 0); if (!error) - xfs_mod_fdblocks(mp, fdblks_delta, 0); + xfs_add_fdblocks(mp, fdblks_delta); spin_lock(&mp->m_sb_lock); } out: @@ -556,24 +554,13 @@ xfs_fs_reserve_ag_blocks( /* * Free space reserved for per-AG metadata. */ -int +void xfs_fs_unreserve_ag_blocks( struct xfs_mount *mp) { xfs_agnumber_t agno; struct xfs_perag *pag; - int error = 0; - int err2; - for_each_perag(mp, agno, pag) { - err2 = xfs_ag_resv_free(pag); - if (err2 && !error) - error = err2; - } - - if (error) - xfs_warn(mp, - "Error %d freeing per-AG metadata reserve pool.", error); - - return error; + for_each_perag(mp, agno, pag) + xfs_ag_resv_free(pag); } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 44457b0a0593..3e2f73bcf831 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -12,6 +12,6 @@ int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request); int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags); int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); -int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); +void xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c new file mode 100644 index 000000000000..c8785ed59543 --- /dev/null +++ b/fs/xfs/xfs_handle.c @@ -0,0 +1,952 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2022-2024 Oracle. + * All rights reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_ioctl.h" +#include "xfs_parent.h" +#include "xfs_da_btree.h" +#include "xfs_handle.h" +#include "xfs_health.h" +#include "xfs_icache.h" +#include "xfs_export.h" +#include "xfs_xattr.h" +#include "xfs_acl.h" + +#include <linux/namei.h> + +static inline size_t +xfs_filehandle_fid_len(void) +{ + struct xfs_handle *handle = NULL; + + return sizeof(struct xfs_fid) - sizeof(handle->ha_fid.fid_len); +} + +static inline size_t +xfs_filehandle_init( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t gen, + struct xfs_handle *handle) +{ + memcpy(&handle->ha_fsid, mp->m_fixedfsid, sizeof(struct xfs_fsid)); + + handle->ha_fid.fid_len = xfs_filehandle_fid_len(); + handle->ha_fid.fid_pad = 0; + handle->ha_fid.fid_gen = gen; + handle->ha_fid.fid_ino = ino; + + return sizeof(struct xfs_handle); +} + +static inline size_t +xfs_fshandle_init( + struct xfs_mount *mp, + struct xfs_handle *handle) +{ + memcpy(&handle->ha_fsid, mp->m_fixedfsid, sizeof(struct xfs_fsid)); + memset(&handle->ha_fid, 0, sizeof(handle->ha_fid)); + + return sizeof(struct xfs_fsid); +} + +/* + * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to + * a file or fs handle. 
+ * + * XFS_IOC_PATH_TO_FSHANDLE + * returns fs handle for a mount point or path within that mount point + * XFS_IOC_FD_TO_HANDLE + * returns full handle for a FD opened in user space + * XFS_IOC_PATH_TO_HANDLE + * returns full handle for a path + */ +int +xfs_find_handle( + unsigned int cmd, + xfs_fsop_handlereq_t *hreq) +{ + int hsize; + xfs_handle_t handle; + struct inode *inode; + struct fd f = {NULL}; + struct path path; + int error; + struct xfs_inode *ip; + + if (cmd == XFS_IOC_FD_TO_HANDLE) { + f = fdget(hreq->fd); + if (!f.file) + return -EBADF; + inode = file_inode(f.file); + } else { + error = user_path_at(AT_FDCWD, hreq->path, 0, &path); + if (error) + return error; + inode = d_inode(path.dentry); + } + ip = XFS_I(inode); + + /* + * We can only generate handles for inodes residing on a XFS filesystem, + * and only for regular files, directories or symbolic links. + */ + error = -EINVAL; + if (inode->i_sb->s_magic != XFS_SB_MAGIC) + goto out_put; + + error = -EBADF; + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + goto out_put; + + + memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); + + if (cmd == XFS_IOC_PATH_TO_FSHANDLE) + hsize = xfs_fshandle_init(ip->i_mount, &handle); + else + hsize = xfs_filehandle_init(ip->i_mount, ip->i_ino, + inode->i_generation, &handle); + + error = -EFAULT; + if (copy_to_user(hreq->ohandle, &handle, hsize) || + copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) + goto out_put; + + error = 0; + + out_put: + if (cmd == XFS_IOC_FD_TO_HANDLE) + fdput(f); + else + path_put(&path); + return error; +} + +/* + * No need to do permission checks on the various pathname components + * as the handle operations are privileged. + */ +STATIC int +xfs_handle_acceptable( + void *context, + struct dentry *dentry) +{ + return 1; +} + +/* Convert handle already copied to kernel space into a dentry. 
*/ +static struct dentry * +xfs_khandle_to_dentry( + struct file *file, + struct xfs_handle *handle) +{ + struct xfs_fid64 fid = { + .ino = handle->ha_fid.fid_ino, + .gen = handle->ha_fid.fid_gen, + }; + + /* + * Only allow handle opens under a directory. + */ + if (!S_ISDIR(file_inode(file)->i_mode)) + return ERR_PTR(-ENOTDIR); + + if (handle->ha_fid.fid_len != xfs_filehandle_fid_len()) + return ERR_PTR(-EINVAL); + + return exportfs_decode_fh(file->f_path.mnt, (struct fid *)&fid, 3, + FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG, + xfs_handle_acceptable, NULL); +} + +/* Convert handle already copied to kernel space into an xfs_inode. */ +static struct xfs_inode * +xfs_khandle_to_inode( + struct file *file, + struct xfs_handle *handle) +{ + struct xfs_inode *ip = XFS_I(file_inode(file)); + struct xfs_mount *mp = ip->i_mount; + struct inode *inode; + + if (!S_ISDIR(VFS_I(ip)->i_mode)) + return ERR_PTR(-ENOTDIR); + + if (handle->ha_fid.fid_len != xfs_filehandle_fid_len()) + return ERR_PTR(-EINVAL); + + inode = xfs_nfs_get_inode(mp->m_super, handle->ha_fid.fid_ino, + handle->ha_fid.fid_gen); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return XFS_I(inode); +} + +/* + * Convert userspace handle data into a dentry. 
+ */ +struct dentry * +xfs_handle_to_dentry( + struct file *parfilp, + void __user *uhandle, + u32 hlen) +{ + xfs_handle_t handle; + + if (hlen != sizeof(xfs_handle_t)) + return ERR_PTR(-EINVAL); + if (copy_from_user(&handle, uhandle, hlen)) + return ERR_PTR(-EFAULT); + + return xfs_khandle_to_dentry(parfilp, &handle); +} + +STATIC struct dentry * +xfs_handlereq_to_dentry( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen); +} + +int +xfs_open_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + const struct cred *cred = current_cred(); + int error; + int fd; + int permflag; + struct file *filp; + struct inode *inode; + struct dentry *dentry; + fmode_t fmode; + struct path path; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + dentry = xfs_handlereq_to_dentry(parfilp, hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + inode = d_inode(dentry); + + /* Restrict xfs_open_by_handle to directories & regular files. */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { + error = -EPERM; + goto out_dput; + } + +#if BITS_PER_LONG != 32 + hreq->oflags |= O_LARGEFILE; +#endif + + permflag = hreq->oflags; + fmode = OPEN_FMODE(permflag); + if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && + (fmode & FMODE_WRITE) && IS_APPEND(inode)) { + error = -EPERM; + goto out_dput; + } + + if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) { + error = -EPERM; + goto out_dput; + } + + /* Can't write directories. 
*/ + if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) { + error = -EISDIR; + goto out_dput; + } + + fd = get_unused_fd_flags(0); + if (fd < 0) { + error = fd; + goto out_dput; + } + + path.mnt = parfilp->f_path.mnt; + path.dentry = dentry; + filp = dentry_open(&path, hreq->oflags, cred); + dput(dentry); + if (IS_ERR(filp)) { + put_unused_fd(fd); + return PTR_ERR(filp); + } + + if (S_ISREG(inode->i_mode)) { + filp->f_flags |= O_NOATIME; + filp->f_mode |= FMODE_NOCMTIME; + } + + fd_install(fd, filp); + return fd; + + out_dput: + dput(dentry); + return error; +} + +int +xfs_readlink_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + struct dentry *dentry; + __u32 olen; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + dentry = xfs_handlereq_to_dentry(parfilp, hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + /* Restrict this handle operation to symlinks only. */ + if (!d_is_symlink(dentry)) { + error = -EINVAL; + goto out_dput; + } + + if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) { + error = -EFAULT; + goto out_dput; + } + + error = vfs_readlink(dentry, hreq->ohandle, olen); + + out_dput: + dput(dentry); + return error; +} + +/* + * Format an attribute and copy it out to the user's buffer. + * Take care to check values and protect against them changing later, + * we may be reading them directly out of a user buffer. + */ +static void +xfs_ioc_attr_put_listent( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + void *value, + int valuelen) +{ + struct xfs_attrlist *alist = context->buffer; + struct xfs_attrlist_ent *aep; + int arraytop; + + ASSERT(!context->seen_enough); + ASSERT(context->count >= 0); + ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); + ASSERT(context->firstu >= sizeof(*alist)); + ASSERT(context->firstu <= context->bufsize); + + /* + * Only list entries in the right namespace. 
+ */ + if (context->attr_filter != (flags & XFS_ATTR_NSP_ONDISK_MASK)) + return; + + arraytop = sizeof(*alist) + + context->count * sizeof(alist->al_offset[0]); + + /* decrement by the actual bytes used by the attr */ + context->firstu -= round_up(offsetof(struct xfs_attrlist_ent, a_name) + + namelen + 1, sizeof(uint32_t)); + if (context->firstu < arraytop) { + trace_xfs_attr_list_full(context); + alist->al_more = 1; + context->seen_enough = 1; + return; + } + + aep = context->buffer + context->firstu; + aep->a_valuelen = valuelen; + memcpy(aep->a_name, name, namelen); + aep->a_name[namelen] = 0; + alist->al_offset[context->count++] = context->firstu; + alist->al_count = context->count; + trace_xfs_attr_list_add(context); +} + +static unsigned int +xfs_attr_filter( + u32 ioc_flags) +{ + if (ioc_flags & XFS_IOC_ATTR_ROOT) + return XFS_ATTR_ROOT; + if (ioc_flags & XFS_IOC_ATTR_SECURE) + return XFS_ATTR_SECURE; + return 0; +} + +static inline enum xfs_attr_update +xfs_xattr_flags( + u32 ioc_flags, + void *value) +{ + if (!value) + return XFS_ATTRUPDATE_REMOVE; + if (ioc_flags & XFS_IOC_ATTR_CREATE) + return XFS_ATTRUPDATE_CREATE; + if (ioc_flags & XFS_IOC_ATTR_REPLACE) + return XFS_ATTRUPDATE_REPLACE; + return XFS_ATTRUPDATE_UPSERT; +} + +int +xfs_ioc_attr_list( + struct xfs_inode *dp, + void __user *ubuf, + size_t bufsize, + int flags, + struct xfs_attrlist_cursor __user *ucursor) +{ + struct xfs_attr_list_context context = { }; + struct xfs_attrlist *alist; + void *buffer; + int error; + + if (bufsize < sizeof(struct xfs_attrlist) || + bufsize > XFS_XATTR_LIST_MAX) + return -EINVAL; + + /* + * Reject flags, only allow namespaces. + */ + if (flags & ~(XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE)) + return -EINVAL; + if (flags == (XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE)) + return -EINVAL; + + /* + * Validate the cursor. 
+ */ + if (copy_from_user(&context.cursor, ucursor, sizeof(context.cursor))) + return -EFAULT; + if (context.cursor.pad1 || context.cursor.pad2) + return -EINVAL; + if (!context.cursor.initted && + (context.cursor.hashval || context.cursor.blkno || + context.cursor.offset)) + return -EINVAL; + + buffer = kvzalloc(bufsize, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + /* + * Initialize the output buffer. + */ + context.dp = dp; + context.resynch = 1; + context.attr_filter = xfs_attr_filter(flags); + context.buffer = buffer; + context.bufsize = round_down(bufsize, sizeof(uint32_t)); + context.firstu = context.bufsize; + context.put_listent = xfs_ioc_attr_put_listent; + + alist = context.buffer; + alist->al_count = 0; + alist->al_more = 0; + alist->al_offset[0] = context.bufsize; + + error = xfs_attr_list(&context); + if (error) + goto out_free; + + if (copy_to_user(ubuf, buffer, bufsize) || + copy_to_user(ucursor, &context.cursor, sizeof(context.cursor))) + error = -EFAULT; +out_free: + kvfree(buffer); + return error; +} + +int +xfs_attrlist_by_handle( + struct file *parfilp, + struct xfs_fsop_attrlist_handlereq __user *p) +{ + struct xfs_fsop_attrlist_handlereq al_hreq; + struct dentry *dentry; + int error = -ENOMEM; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&al_hreq, p, sizeof(al_hreq))) + return -EFAULT; + + dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), al_hreq.buffer, + al_hreq.buflen, al_hreq.flags, &p->pos); + dput(dentry); + return error; +} + +static int +xfs_attrmulti_attr_get( + struct inode *inode, + unsigned char *name, + unsigned char __user *ubuf, + uint32_t *len, + uint32_t flags) +{ + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = xfs_attr_filter(flags), + .name = name, + .namelen = strlen(name), + .valuelen = *len, + }; + int error; + + if (*len > XFS_XATTR_SIZE_MAX) + return -EINVAL; + 
+ error = xfs_attr_get(&args); + if (error) + goto out_kfree; + + *len = args.valuelen; + if (copy_to_user(ubuf, args.value, args.valuelen)) + error = -EFAULT; + +out_kfree: + kvfree(args.value); + return error; +} + +static int +xfs_attrmulti_attr_set( + struct inode *inode, + unsigned char *name, + const unsigned char __user *ubuf, + uint32_t len, + uint32_t flags) +{ + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = xfs_attr_filter(flags), + .name = name, + .namelen = strlen(name), + }; + int error; + + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + + if (ubuf) { + if (len > XFS_XATTR_SIZE_MAX) + return -EINVAL; + args.value = memdup_user(ubuf, len); + if (IS_ERR(args.value)) + return PTR_ERR(args.value); + args.valuelen = len; + } + + error = xfs_attr_change(&args, xfs_xattr_flags(flags, args.value)); + if (!error && (flags & XFS_IOC_ATTR_ROOT)) + xfs_forget_acl(inode, name); + kfree(args.value); + return error; +} + +int +xfs_ioc_attrmulti_one( + struct file *parfilp, + struct inode *inode, + uint32_t opcode, + void __user *uname, + void __user *value, + uint32_t *len, + uint32_t flags) +{ + unsigned char *name; + int error; + + if ((flags & XFS_IOC_ATTR_ROOT) && (flags & XFS_IOC_ATTR_SECURE)) + return -EINVAL; + + name = strndup_user(uname, MAXNAMELEN); + if (IS_ERR(name)) + return PTR_ERR(name); + + switch (opcode) { + case ATTR_OP_GET: + error = xfs_attrmulti_attr_get(inode, name, value, len, flags); + break; + case ATTR_OP_REMOVE: + value = NULL; + *len = 0; + fallthrough; + case ATTR_OP_SET: + error = mnt_want_write_file(parfilp); + if (error) + break; + error = xfs_attrmulti_attr_set(inode, name, value, *len, flags); + mnt_drop_write_file(parfilp); + break; + default: + error = -EINVAL; + break; + } + + kfree(name); + return error; +} + +int +xfs_attrmulti_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + xfs_attr_multiop_t *ops; + xfs_fsop_attrmulti_handlereq_t am_hreq; + struct dentry *dentry; 
+ unsigned int i, size; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) + return -EFAULT; + + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t)) + return -E2BIG; + + dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = -E2BIG; + size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); + if (!size || size > 16 * PAGE_SIZE) + goto out_dput; + + ops = memdup_user(am_hreq.ops, size); + if (IS_ERR(ops)) { + error = PTR_ERR(ops); + goto out_dput; + } + + error = 0; + for (i = 0; i < am_hreq.opcount; i++) { + ops[i].am_error = xfs_ioc_attrmulti_one(parfilp, + d_inode(dentry), ops[i].am_opcode, + ops[i].am_attrname, ops[i].am_attrvalue, + &ops[i].am_length, ops[i].am_flags); + } + + if (copy_to_user(am_hreq.ops, ops, size)) + error = -EFAULT; + + kfree(ops); + out_dput: + dput(dentry); + return error; +} + +struct xfs_getparents_ctx { + struct xfs_attr_list_context context; + struct xfs_getparents_by_handle gph; + + /* File to target */ + struct xfs_inode *ip; + + /* Internal buffer where we format records */ + void *krecords; + + /* Last record filled out */ + struct xfs_getparents_rec *lastrec; + + unsigned int count; +}; + +static inline unsigned int +xfs_getparents_rec_sizeof( + unsigned int namelen) +{ + return round_up(sizeof(struct xfs_getparents_rec) + namelen + 1, + sizeof(uint64_t)); +} + +static void +xfs_getparents_put_listent( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + void *value, + int valuelen) +{ + struct xfs_getparents_ctx *gpx = + container_of(context, struct xfs_getparents_ctx, context); + struct xfs_inode *ip = context->dp; + struct xfs_mount *mp = ip->i_mount; + struct xfs_getparents *gp = &gpx->gph.gph_request; + struct xfs_getparents_rec *gpr = gpx->krecords + context->firstu; + unsigned short reclen = + 
xfs_getparents_rec_sizeof(namelen); + xfs_ino_t ino; + uint32_t gen; + int error; + + if (!(flags & XFS_ATTR_PARENT)) + return; + + error = xfs_parent_from_attr(mp, flags, name, namelen, value, valuelen, + &ino, &gen); + if (error) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_PARENT); + context->seen_enough = -EFSCORRUPTED; + return; + } + + /* + * We found a parent pointer, but we've filled up the buffer. Signal + * to the caller that we did /not/ reach the end of the parent pointer + * recordset. + */ + if (context->firstu > context->bufsize - reclen) { + context->seen_enough = 1; + return; + } + + /* Format the parent pointer directly into the caller buffer. */ + gpr->gpr_reclen = reclen; + xfs_filehandle_init(mp, ino, gen, &gpr->gpr_parent); + memcpy(gpr->gpr_name, name, namelen); + gpr->gpr_name[namelen] = 0; + + trace_xfs_getparents_put_listent(ip, gp, context, gpr); + + context->firstu += reclen; + gpx->count++; + gpx->lastrec = gpr; +} + +/* Expand the last record to fill the rest of the caller's buffer. */ +static inline void +xfs_getparents_expand_lastrec( + struct xfs_getparents_ctx *gpx) +{ + struct xfs_getparents *gp = &gpx->gph.gph_request; + struct xfs_getparents_rec *gpr = gpx->lastrec; + + if (!gpx->lastrec) + gpr = gpx->krecords; + + gpr->gpr_reclen = gp->gp_bufsize - ((void *)gpr - gpx->krecords); + + trace_xfs_getparents_expand_lastrec(gpx->ip, gp, &gpx->context, gpr); +} + +static inline void __user *u64_to_uptr(u64 val) +{ + return (void __user *)(uintptr_t)val; +} + +/* Retrieve the parent pointers for a given inode. 
*/ +STATIC int +xfs_getparents( + struct xfs_getparents_ctx *gpx) +{ + struct xfs_getparents *gp = &gpx->gph.gph_request; + struct xfs_inode *ip = gpx->ip; + struct xfs_mount *mp = ip->i_mount; + size_t bufsize; + int error; + + /* Check size of buffer requested by user */ + if (gp->gp_bufsize > XFS_XATTR_LIST_MAX) + return -ENOMEM; + if (gp->gp_bufsize < xfs_getparents_rec_sizeof(1)) + return -EINVAL; + + if (gp->gp_iflags & ~XFS_GETPARENTS_IFLAGS_ALL) + return -EINVAL; + if (gp->gp_reserved) + return -EINVAL; + + bufsize = round_down(gp->gp_bufsize, sizeof(uint64_t)); + gpx->krecords = kvzalloc(bufsize, GFP_KERNEL); + if (!gpx->krecords) { + bufsize = min(bufsize, PAGE_SIZE); + gpx->krecords = kvzalloc(bufsize, GFP_KERNEL); + if (!gpx->krecords) + return -ENOMEM; + } + + gpx->context.dp = ip; + gpx->context.resynch = 1; + gpx->context.put_listent = xfs_getparents_put_listent; + gpx->context.bufsize = bufsize; + /* firstu is used to track the bytes filled in the buffer */ + gpx->context.firstu = 0; + + /* Copy the cursor provided by caller */ + memcpy(&gpx->context.cursor, &gp->gp_cursor, + sizeof(struct xfs_attrlist_cursor)); + gpx->count = 0; + gp->gp_oflags = 0; + + trace_xfs_getparents_begin(ip, gp, &gpx->context.cursor); + + error = xfs_attr_list(&gpx->context); + if (error) + goto out_free_buf; + if (gpx->context.seen_enough < 0) { + error = gpx->context.seen_enough; + goto out_free_buf; + } + xfs_getparents_expand_lastrec(gpx); + + /* Update the caller with the current cursor position */ + memcpy(&gp->gp_cursor, &gpx->context.cursor, + sizeof(struct xfs_attrlist_cursor)); + + /* Is this the root directory? */ + if (ip->i_ino == mp->m_sb.sb_rootino) + gp->gp_oflags |= XFS_GETPARENTS_OFLAG_ROOT; + + if (gpx->context.seen_enough == 0) { + /* + * If we did not run out of buffer space, then we reached the + * end of the pptr recordset, so set the DONE flag. 
+ */ + gp->gp_oflags |= XFS_GETPARENTS_OFLAG_DONE; + } else if (gpx->count == 0) { + /* + * If we ran out of buffer space before copying any parent + * pointers at all, the caller's buffer was too short. Tell + * userspace that, erm, the message is too long. + */ + error = -EMSGSIZE; + goto out_free_buf; + } + + trace_xfs_getparents_end(ip, gp, &gpx->context.cursor); + + ASSERT(gpx->context.firstu <= gpx->gph.gph_request.gp_bufsize); + + /* Copy the records to userspace. */ + if (copy_to_user(u64_to_uptr(gpx->gph.gph_request.gp_buffer), + gpx->krecords, gpx->context.firstu)) + error = -EFAULT; + +out_free_buf: + kvfree(gpx->krecords); + gpx->krecords = NULL; + return error; +} + +/* Retrieve the parents of this file and pass them back to userspace. */ +int +xfs_ioc_getparents( + struct file *file, + struct xfs_getparents __user *ureq) +{ + struct xfs_getparents_ctx gpx = { + .ip = XFS_I(file_inode(file)), + }; + struct xfs_getparents *kreq = &gpx.gph.gph_request; + struct xfs_mount *mp = gpx.ip->i_mount; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!xfs_has_parent(mp)) + return -EOPNOTSUPP; + if (copy_from_user(kreq, ureq, sizeof(*kreq))) + return -EFAULT; + + error = xfs_getparents(&gpx); + if (error) + return error; + + if (copy_to_user(ureq, kreq, sizeof(*kreq))) + return -EFAULT; + + return 0; +} + +/* Retrieve the parents of this file handle and pass them back to userspace. 
*/ +int +xfs_ioc_getparents_by_handle( + struct file *file, + struct xfs_getparents_by_handle __user *ureq) +{ + struct xfs_getparents_ctx gpx = { }; + struct xfs_inode *ip = XFS_I(file_inode(file)); + struct xfs_mount *mp = ip->i_mount; + struct xfs_getparents_by_handle *kreq = &gpx.gph; + struct xfs_handle *handle = &kreq->gph_handle; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!xfs_has_parent(mp)) + return -EOPNOTSUPP; + if (copy_from_user(kreq, ureq, sizeof(*kreq))) + return -EFAULT; + + /* + * We don't use exportfs_decode_fh because it does too much work here. + * If the handle refers to a directory, the exportfs code will walk + * upwards through the directory tree to connect the dentries to the + * root directory dentry. For GETPARENTS we don't care about that + * because we're not actually going to open a file descriptor; we only + * want to open an inode and read its parent pointers. + * + * Note that xfs_scrub uses GETPARENTS to log that it will try to fix a + * corrupted file's metadata. For this usecase we would really rather + * userspace single-step the path reconstruction to avoid loops or + * other strange things if the directory tree is corrupt. + */ + gpx.ip = xfs_khandle_to_inode(file, handle); + if (IS_ERR(gpx.ip)) + return PTR_ERR(gpx.ip); + + error = xfs_getparents(&gpx); + if (error) + goto out_rele; + + if (copy_to_user(ureq, kreq, sizeof(*kreq))) + error = -EFAULT; + +out_rele: + xfs_irele(gpx.ip); + return error; +} diff --git a/fs/xfs/xfs_handle.h b/fs/xfs/xfs_handle.h new file mode 100644 index 000000000000..6799a86d8565 --- /dev/null +++ b/fs/xfs/xfs_handle.h @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2022-2024 Oracle. + * All rights reserved. 
+ */ +#ifndef __XFS_HANDLE_H__ +#define __XFS_HANDLE_H__ + +int xfs_attrlist_by_handle(struct file *parfilp, + struct xfs_fsop_attrlist_handlereq __user *p); +int xfs_attrmulti_by_handle(struct file *parfilp, void __user *arg); + +int xfs_find_handle(unsigned int cmd, struct xfs_fsop_handlereq *hreq); +int xfs_open_by_handle(struct file *parfilp, struct xfs_fsop_handlereq *hreq); +int xfs_readlink_by_handle(struct file *parfilp, + struct xfs_fsop_handlereq *hreq); + +int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode, + uint32_t opcode, void __user *uname, void __user *value, + uint32_t *len, uint32_t flags); +int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, + size_t bufsize, int flags, + struct xfs_attrlist_cursor __user *ucursor); + +struct dentry *xfs_handle_to_dentry(struct file *parfilp, void __user *uhandle, + u32 hlen); + +int xfs_ioc_getparents(struct file *file, struct xfs_getparents __user *arg); +int xfs_ioc_getparents_by_handle(struct file *file, + struct xfs_getparents_by_handle __user *arg); + +#endif /* __XFS_HANDLE_H__ */ diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index b39f959146bc..10f116d093a2 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -470,6 +470,7 @@ static const struct ioctl_sick_map ino_map[] = { { XFS_SICK_INO_BMBTA_ZAPPED, XFS_BS_SICK_BMBTA }, { XFS_SICK_INO_DIR_ZAPPED, XFS_BS_SICK_DIR }, { XFS_SICK_INO_SYMLINK_ZAPPED, XFS_BS_SICK_SYMLINK }, + { XFS_SICK_INO_DIRTREE, XFS_BS_SICK_DIRTREE }, { 0, 0 }, }; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 74f1812b03cb..0953163a2d84 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -613,7 +613,6 @@ xfs_iget_cache_miss( struct xfs_inode *ip; int error; xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); - int iflags; ip = xfs_inode_alloc(mp, ino); if (!ip) @@ -693,13 +692,12 @@ xfs_iget_cache_miss( * memory barrier that ensures this detection works correctly at lookup * time. 
*/ - iflags = XFS_INEW; if (flags & XFS_IGET_DONTCACHE) d_mark_dontcache(VFS_I(ip)); ip->i_udquot = NULL; ip->i_gdquot = NULL; ip->i_pdquot = NULL; - xfs_iflags_set(ip, iflags); + xfs_iflags_set(ip, XFS_INEW); /* insert the new inode */ spin_lock(&pag->pag_ici_lock); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d55b42b2480d..58fb7a5062e1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -16,6 +16,7 @@ #include "xfs_inode.h" #include "xfs_dir2.h" #include "xfs_attr.h" +#include "xfs_bit.h" #include "xfs_trans_space.h" #include "xfs_trans.h" #include "xfs_buf_item.h" @@ -38,13 +39,12 @@ #include "xfs_ag.h" #include "xfs_log_priv.h" #include "xfs_health.h" +#include "xfs_pnfs.h" +#include "xfs_parent.h" +#include "xfs_xattr.h" struct kmem_cache *xfs_inode_cache; -STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); -STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, - struct xfs_inode *); - /* * helper function to extract extent size hint from inode */ @@ -60,7 +60,8 @@ xfs_get_extsz_hint( return 0; if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) return ip->i_extsize; - if (XFS_IS_REALTIME_INODE(ip)) + if (XFS_IS_REALTIME_INODE(ip) && + ip->i_mount->m_sb.sb_rextsize > 1) return ip->i_mount->m_sb.sb_rextsize; return 0; } @@ -420,7 +421,7 @@ xfs_lock_inumorder( * lock more than one at a time, lockdep will report false positives saying we * have violated locking orders. */ -static void +void xfs_lock_inodes( struct xfs_inode **ips, int inodes, @@ -749,6 +750,8 @@ xfs_inode_inherit_flags2( /* * Initialise a newly allocated inode and return the in-core inode to the * caller locked exclusively. + * + * Caller is responsible for unlocking the inode manually upon return */ int xfs_init_new_inode( @@ -875,7 +878,7 @@ xfs_init_new_inode( /* * Log the new values stuffed into the inode. 
*/ - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, flags); /* now that we have an i_mode we can setup the inode structure */ @@ -890,24 +893,27 @@ xfs_init_new_inode( * link count to go to zero, move the inode to AGI unlinked list so that it can * be freed when the last active reference goes away via xfs_inactive(). */ -static int /* error */ +int xfs_droplink( - xfs_trans_t *tp, - xfs_inode_t *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - if (VFS_I(ip)->i_nlink == 0) { - xfs_alert(ip->i_mount, - "%s: Attempt to drop inode (%llu) with nlink zero.", - __func__, ip->i_ino); - return -EFSCORRUPTED; - } + struct inode *inode = VFS_I(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - drop_nlink(VFS_I(ip)); + if (inode->i_nlink == 0) { + xfs_info_ratelimited(tp->t_mountp, + "Inode 0x%llx link count dropped below zero. Pinning link count.", + ip->i_ino); + set_nlink(inode, XFS_NLINK_PINNED); + } + if (inode->i_nlink != XFS_NLINK_PINNED) + drop_nlink(inode); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (VFS_I(ip)->i_nlink) + if (inode->i_nlink) return 0; return xfs_iunlink(tp, ip); @@ -916,14 +922,22 @@ xfs_droplink( /* * Increment the link count on an inode & log the change. */ -static void +void xfs_bumplink( - xfs_trans_t *tp, - xfs_inode_t *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { + struct inode *inode = VFS_I(ip); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - inc_nlink(VFS_I(ip)); + if (inode->i_nlink == XFS_NLINK_PINNED - 1) + xfs_info_ratelimited(tp->t_mountp, + "Inode 0x%llx link count exceeded maximum. 
Pinning link count.", + ip->i_ino); + if (inode->i_nlink != XFS_NLINK_PINNED) + inc_nlink(inode); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -1005,7 +1019,7 @@ xfs_dir_hook_setup( int xfs_create( struct mnt_idmap *idmap, - xfs_inode_t *dp, + struct xfs_inode *dp, struct xfs_name *name, umode_t mode, dev_t rdev, @@ -1017,7 +1031,7 @@ xfs_create( struct xfs_inode *ip = NULL; struct xfs_trans *tp = NULL; int error; - bool unlock_dp_on_error = false; + bool unlock_dp_on_error = false; prid_t prid; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; @@ -1025,6 +1039,7 @@ xfs_create( struct xfs_trans_res *tres; uint resblks; xfs_ino_t ino; + struct xfs_parent_args *ppargs; trace_xfs_create(dp, name); @@ -1046,13 +1061,17 @@ xfs_create( return error; if (is_dir) { - resblks = XFS_MKDIR_SPACE_RES(mp, name->len); + resblks = xfs_mkdir_space_res(mp, name->len); tres = &M_RES(mp)->tr_mkdir; } else { - resblks = XFS_CREATE_SPACE_RES(mp, name->len); + resblks = xfs_create_space_res(mp, name->len); tres = &M_RES(mp)->tr_create; } + error = xfs_parent_start(mp, &ppargs); + if (error) + goto out_release_dquots; + /* * Initially assume that the file does not exist and * reserve the resources for that case. If that is not @@ -1068,7 +1087,7 @@ xfs_create( resblks, &tp); } if (error) - goto out_release_dquots; + goto out_parent; xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; @@ -1092,8 +1111,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - unlock_dp_on_error = false; + xfs_trans_ijoin(tp, dp, 0); error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks - XFS_IALLOC_SPACE_RES(mp)); @@ -1113,6 +1131,16 @@ xfs_create( } /* + * If we have parent pointers, we need to add the attribute containing + * the parent information now. 
+ */ + if (ppargs) { + error = xfs_parent_addname(tp, ppargs, dp, name, ip); + if (error) + goto out_trans_cancel; + } + + /* * Create ip with a reference from dp, and add '.' and '..' references * if it's a directory. */ @@ -1142,6 +1170,9 @@ xfs_create( xfs_qm_dqrele(pdqp); *ipp = ip; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_parent_finish(mp, ppargs); return 0; out_trans_cancel: @@ -1153,9 +1184,12 @@ xfs_create( * transactions and deadlocks from xfs_inactive. */ if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_finish_inode_setup(ip); xfs_irele(ip); } + out_parent: + xfs_parent_finish(mp, ppargs); out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -1171,6 +1205,7 @@ xfs_create_tmpfile( struct mnt_idmap *idmap, struct xfs_inode *dp, umode_t mode, + bool init_xattrs, struct xfs_inode **ipp) { struct xfs_mount *mp = dp->i_mount; @@ -1211,7 +1246,7 @@ xfs_create_tmpfile( error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); if (!error) error = xfs_init_new_inode(idmap, tp, dp, ino, mode, - 0, 0, prid, false, &ip); + 0, 0, prid, init_xattrs, &ip); if (error) goto out_trans_cancel; @@ -1238,6 +1273,7 @@ xfs_create_tmpfile( xfs_qm_dqrele(pdqp); *ipp = ip; + xfs_iunlock(ip, XFS_ILOCK_EXCL); return 0; out_trans_cancel: @@ -1249,6 +1285,7 @@ xfs_create_tmpfile( * transactions and deadlocks from xfs_inactive. 
*/ if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_finish_inode_setup(ip); xfs_irele(ip); } @@ -1262,14 +1299,15 @@ xfs_create_tmpfile( int xfs_link( - xfs_inode_t *tdp, - xfs_inode_t *sip, + struct xfs_inode *tdp, + struct xfs_inode *sip, struct xfs_name *target_name) { - xfs_mount_t *mp = tdp->i_mount; - xfs_trans_t *tp; + struct xfs_mount *mp = tdp->i_mount; + struct xfs_trans *tp; int error, nospace_error = 0; int resblks; + struct xfs_parent_args *ppargs; trace_xfs_link(tdp, target_name); @@ -1288,11 +1326,25 @@ xfs_link( if (error) goto std_return; - resblks = XFS_LINK_SPACE_RES(mp, target_name->len); + error = xfs_parent_start(mp, &ppargs); + if (error) + goto std_return; + + resblks = xfs_link_space_res(mp, target_name->len); error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks, &tp, &nospace_error); if (error) - goto std_return; + goto out_parent; + + /* + * We don't allow reservationless or quotaless hardlinking when parent + * pointers are enabled because we can't back out if the xattrs must + * grow. + */ + if (ppargs && nospace_error) { + error = nospace_error; + goto error_return; + } /* * If we are using project inheritance, we only allow hard link @@ -1343,6 +1395,19 @@ xfs_link( xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); xfs_bumplink(tp, sip); + + /* + * If we have parent pointers, we now need to add the parent record to + * the attribute fork of the inode. If this is the initial parent + * attribute, we need to create it correctly, otherwise we can just add + * the parent to the inode. 
+ */ + if (ppargs) { + error = xfs_parent_addname(tp, ppargs, tdp, target_name, sip); + if (error) + goto error_return; + } + xfs_dir_update_hook(tdp, sip, 1, target_name); /* @@ -1353,10 +1418,18 @@ xfs_link( if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) xfs_trans_set_sync(tp); - return xfs_trans_commit(tp); + error = xfs_trans_commit(tp); + xfs_iunlock(tdp, XFS_ILOCK_EXCL); + xfs_iunlock(sip, XFS_ILOCK_EXCL); + xfs_parent_finish(mp, ppargs); + return error; error_return: xfs_trans_cancel(tp); + xfs_iunlock(tdp, XFS_ILOCK_EXCL); + xfs_iunlock(sip, XFS_ILOCK_EXCL); + out_parent: + xfs_parent_finish(mp, ppargs); std_return: if (error == -ENOSPC && nospace_error) error = nospace_error; @@ -1555,6 +1628,51 @@ out_unlock: } /* + * Mark all the buffers attached to this directory stale. In theory we should + * never be freeing a directory with any blocks at all, but this covers the + * case where we've recovered a directory swap with a "temporary" directory + * created by online repair and now need to dump it. + */ +STATIC void +xfs_inactive_dir( + struct xfs_inode *dp) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK); + xfs_fileoff_t off; + + /* + * Invalidate each directory block. All directory blocks are of + * fsbcount length and alignment, so we only need to walk those same + * offsets. We hold the only reference to this inode, so we must wait + * for the buffer locks. 
+ */ + for_each_xfs_iext(ifp, &icur, &got) { + for (off = round_up(got.br_startoff, geo->fsbcount); + off < got.br_startoff + got.br_blockcount; + off += geo->fsbcount) { + struct xfs_buf *bp = NULL; + xfs_fsblock_t fsbno; + int error; + + fsbno = (off - got.br_startoff) + got.br_startblock; + error = xfs_buf_incore(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, fsbno), + XFS_FSB_TO_BB(mp, geo->fsbcount), + XBF_LIVESCAN, &bp); + if (error) + continue; + + xfs_buf_stale(bp); + xfs_buf_relse(bp); + } + } +} + +/* * xfs_inactive_truncate * * Called to perform a truncate when an inode becomes unlinked. @@ -1864,6 +1982,11 @@ xfs_inactive( goto out; } + if (S_ISDIR(VFS_I(ip)->i_mode) && ip->i_df.if_nextents > 0) { + xfs_inactive_dir(ip); + truncate = 1; + } + if (S_ISLNK(VFS_I(ip)->i_mode)) error = xfs_inactive_symlink(ip); else if (truncate) @@ -1937,7 +2060,7 @@ out: * only unlinked, referenced inodes can be on the unlinked inode list. If we * don't find the inode in cache, then let the caller handle the situation. */ -static struct xfs_inode * +struct xfs_inode * xfs_iunlink_lookup( struct xfs_perag *pag, xfs_agino_t agino) @@ -2150,7 +2273,7 @@ xfs_iunlink_insert_inode( * We place the on-disk inode on a list in the AGI. It will be pulled from this * list when the inode is freed. */ -STATIC int +int xfs_iunlink( struct xfs_trans *tp, struct xfs_inode *ip) @@ -2167,7 +2290,7 @@ xfs_iunlink( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(pag, tp, &agibp); + error = xfs_read_agi(pag, tp, 0, &agibp); if (error) goto out; @@ -2252,7 +2375,7 @@ xfs_iunlink_remove_inode( /* * Pull the on-disk inode from the AGI unlinked list. */ -STATIC int +int xfs_iunlink_remove( struct xfs_trans *tp, struct xfs_perag *pag, @@ -2264,7 +2387,7 @@ xfs_iunlink_remove( trace_xfs_iunlink_remove(ip); /* Get the agi buffer first. It ensures lock ordering on the list. 
*/ - error = xfs_read_agi(pag, tp, &agibp); + error = xfs_read_agi(pag, tp, 0, &agibp); if (error) return error; @@ -2598,16 +2721,17 @@ xfs_iunpin_wait( */ int xfs_remove( - xfs_inode_t *dp, + struct xfs_inode *dp, struct xfs_name *name, - xfs_inode_t *ip) + struct xfs_inode *ip) { - xfs_mount_t *mp = dp->i_mount; - xfs_trans_t *tp = NULL; + struct xfs_mount *mp = dp->i_mount; + struct xfs_trans *tp = NULL; int is_dir = S_ISDIR(VFS_I(ip)->i_mode); int dontcare; int error = 0; uint resblks; + struct xfs_parent_args *ppargs; trace_xfs_remove(dp, name); @@ -2624,6 +2748,10 @@ xfs_remove( if (error) goto std_return; + error = xfs_parent_start(mp, &ppargs); + if (error) + goto std_return; + /* * We try to get the real space reservation first, allowing for * directory btree deletion(s) implying possible bmap insert(s). If we @@ -2635,12 +2763,12 @@ xfs_remove( * the directory code can handle a reservationless update and we don't * want to prevent a user from trying to free space by deleting things. */ - resblks = XFS_REMOVE_SPACE_RES(mp); + resblks = xfs_remove_space_res(mp, name->len); error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks, &tp, &dontcare); if (error) { ASSERT(error != -ENOSPC); - goto std_return; + goto out_parent; } /* @@ -2700,6 +2828,13 @@ xfs_remove( goto out_trans_cancel; } + /* Remove parent pointer. */ + if (ppargs) { + error = xfs_parent_removename(tp, ppargs, dp, name, ip); + if (error) + goto out_trans_cancel; + } + /* * Drop the link from dp to ip, and if ip was a directory, remove the * '.' and '..' references since we freed the directory. 
@@ -2716,19 +2851,42 @@ xfs_remove( error = xfs_trans_commit(tp); if (error) - goto std_return; + goto out_unlock; if (is_dir && xfs_inode_is_filestream(ip)) xfs_filestream_deassociate(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_parent_finish(mp, ppargs); return 0; out_trans_cancel: xfs_trans_cancel(tp); + out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + out_parent: + xfs_parent_finish(mp, ppargs); std_return: return error; } +static inline void +xfs_iunlock_rename( + struct xfs_inode **i_tab, + int num_inodes) +{ + int i; + + for (i = num_inodes - 1; i >= 0; i--) { + /* Skip duplicate inodes if src and target dps are the same */ + if (!i_tab[i] || (i > 0 && i_tab[i] == i_tab[i - 1])) + continue; + xfs_iunlock(i_tab[i], XFS_ILOCK_EXCL); + } +} + /* * Enter all inodes for a rename transaction into a sorted array. */ @@ -2743,7 +2901,7 @@ xfs_sort_for_rename( struct xfs_inode **i_tab,/* out: sorted array of inodes */ int *num_inodes) /* in/out: inodes in array */ { - int i, j; + int i; ASSERT(*num_inodes == __XFS_SORT_INODES); memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *)); @@ -2765,17 +2923,26 @@ xfs_sort_for_rename( i_tab[i++] = wip; *num_inodes = i; + xfs_sort_inodes(i_tab, *num_inodes); +} + +void +xfs_sort_inodes( + struct xfs_inode **i_tab, + unsigned int num_inodes) +{ + int i, j; + + ASSERT(num_inodes <= __XFS_SORT_INODES); + /* * Sort the elements via bubble sort. (Remember, there are at * most 5 elements to sort, so this is adequate.) 
*/ - for (i = 0; i < *num_inodes; i++) { - for (j = 1; j < *num_inodes; j++) { - if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { - struct xfs_inode *temp = i_tab[j]; - i_tab[j] = i_tab[j-1]; - i_tab[j-1] = temp; - } + for (i = 0; i < num_inodes; i++) { + for (j = 1; j < num_inodes; j++) { + if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) + swap(i_tab[j], i_tab[j - 1]); } } } @@ -2805,15 +2972,17 @@ xfs_cross_rename( struct xfs_inode *dp1, struct xfs_name *name1, struct xfs_inode *ip1, + struct xfs_parent_args *ip1_ppargs, struct xfs_inode *dp2, struct xfs_name *name2, struct xfs_inode *ip2, + struct xfs_parent_args *ip2_ppargs, int spaceres) { - int error = 0; - int ip1_flags = 0; - int ip2_flags = 0; - int dp2_flags = 0; + int error = 0; + int ip1_flags = 0; + int ip2_flags = 0; + int dp2_flags = 0; /* Swap inode number for dirent in first parent */ error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres); @@ -2882,6 +3051,21 @@ xfs_cross_rename( } } + /* Schedule parent pointer replacements */ + if (ip1_ppargs) { + error = xfs_parent_replacename(tp, ip1_ppargs, dp1, name1, dp2, + name2, ip1); + if (error) + goto out_trans_abort; + } + + if (ip2_ppargs) { + error = xfs_parent_replacename(tp, ip2_ppargs, dp2, name2, dp1, + name1, ip2); + if (error) + goto out_trans_abort; + } + if (ip1_flags) { xfs_trans_ichgtime(tp, ip1, ip1_flags); xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); @@ -2937,7 +3121,7 @@ xfs_rename_alloc_whiteout( int error; error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE, - &tmpfile); + xfs_has_parent(dp->i_mount), &tmpfile); if (error) return error; @@ -2981,6 +3165,9 @@ xfs_rename( struct xfs_trans *tp; struct xfs_inode *wip = NULL; /* whiteout inode */ struct xfs_inode *inodes[__XFS_SORT_INODES]; + struct xfs_parent_args *src_ppargs = NULL; + struct xfs_parent_args *tgt_ppargs = NULL; + struct xfs_parent_args *wip_ppargs = NULL; int i; int num_inodes = __XFS_SORT_INODES; bool new_parent = (src_dp != target_dp); @@ -3012,9 +3199,26 @@ 
xfs_rename( xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, inodes, &num_inodes); + error = xfs_parent_start(mp, &src_ppargs); + if (error) + goto out_release_wip; + + if (wip) { + error = xfs_parent_start(mp, &wip_ppargs); + if (error) + goto out_src_ppargs; + } + + if (target_ip) { + error = xfs_parent_start(mp, &tgt_ppargs); + if (error) + goto out_wip_ppargs; + } + retry: nospace_error = 0; - spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); + spaceres = xfs_rename_space_res(mp, src_name->len, target_ip != NULL, + target_name->len, wip != NULL); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp); if (error == -ENOSPC) { nospace_error = error; @@ -3023,14 +3227,26 @@ retry: &tp); } if (error) - goto out_release_wip; + goto out_tgt_ppargs; + + /* + * We don't allow reservationless renaming when parent pointers are + * enabled because we can't back out if the xattrs must grow. + */ + if (src_ppargs && nospace_error) { + error = nospace_error; + xfs_trans_cancel(tp); + goto out_tgt_ppargs; + } /* * Attach the dquots to the inodes */ error = xfs_qm_vop_rename_dqattach(inodes); - if (error) - goto out_trans_cancel; + if (error) { + xfs_trans_cancel(tp); + goto out_tgt_ppargs; + } /* * Lock all the participating inodes. Depending upon whether @@ -3041,18 +3257,16 @@ retry: xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); /* - * Join all the inodes to the transaction. From this point on, - * we can rely on either trans_commit or trans_cancel to unlock - * them. + * Join all the inodes to the transaction. 
*/ - xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_dp, 0); if (new_parent) - xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_dp, 0); + xfs_trans_ijoin(tp, src_ip, 0); if (target_ip) - xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_ip, 0); if (wip) - xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, wip, 0); /* * If we are using project inheritance, we only allow renames @@ -3066,10 +3280,13 @@ retry: } /* RENAME_EXCHANGE is unique from here on. */ - if (flags & RENAME_EXCHANGE) - return xfs_cross_rename(tp, src_dp, src_name, src_ip, - target_dp, target_name, target_ip, - spaceres); + if (flags & RENAME_EXCHANGE) { + error = xfs_cross_rename(tp, src_dp, src_name, src_ip, + src_ppargs, target_dp, target_name, target_ip, + tgt_ppargs, spaceres); + nospace_error = 0; + goto out_unlock; + } /* * Try to reserve quota to handle an expansion of the target directory. @@ -3083,6 +3300,7 @@ retry: if (error == -EDQUOT || error == -ENOSPC) { if (!retried) { xfs_trans_cancel(tp); + xfs_iunlock_rename(inodes, num_inodes); xfs_blockgc_free_quota(target_dp, 0); retried = true; goto retry; @@ -3097,6 +3315,15 @@ retry: } /* + * We don't allow quotaless renaming when parent pointers are enabled + * because we can't back out if the xattrs must grow. + */ + if (src_ppargs && nospace_error) { + error = nospace_error; + goto out_trans_cancel; + } + + /* * Check for expected errors before we dirty the transaction * so we can return an error without a transaction abort. */ @@ -3142,7 +3369,7 @@ retry: pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inodes[i]->i_ino)); - error = xfs_read_agi(pag, tp, &bp); + error = xfs_read_agi(pag, tp, 0, &bp); xfs_perag_put(pag); if (error) goto out_trans_cancel; @@ -3288,6 +3515,28 @@ retry: if (error) goto out_trans_cancel; + /* Schedule parent pointer updates. 
*/ + if (wip_ppargs) { + error = xfs_parent_addname(tp, wip_ppargs, src_dp, src_name, + wip); + if (error) + goto out_trans_cancel; + } + + if (src_ppargs) { + error = xfs_parent_replacename(tp, src_ppargs, src_dp, + src_name, target_dp, target_name, src_ip); + if (error) + goto out_trans_cancel; + } + + if (tgt_ppargs) { + error = xfs_parent_removename(tp, tgt_ppargs, target_dp, + target_name, target_ip); + if (error) + goto out_trans_cancel; + } + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); if (new_parent) @@ -3309,12 +3558,19 @@ retry: xfs_dir_update_hook(src_dp, wip, 1, src_name); error = xfs_finish_rename(tp); - if (wip) - xfs_irele(wip); - return error; + nospace_error = 0; + goto out_unlock; out_trans_cancel: xfs_trans_cancel(tp); +out_unlock: + xfs_iunlock_rename(inodes, num_inodes); +out_tgt_ppargs: + xfs_parent_finish(mp, tgt_ppargs); +out_wip_ppargs: + xfs_parent_finish(mp, wip_ppargs); +out_src_ppargs: + xfs_parent_finish(mp, src_ppargs); out_release_wip: if (wip) xfs_irele(wip); @@ -3814,7 +4070,7 @@ xfs_inode_reload_unlinked_bucket( /* Grab the first inode in the list */ pag = xfs_perag_get(mp, agno); - error = xfs_ialloc_read_agi(pag, tp, &agibp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agibp); xfs_perag_put(pag); if (error) return error; @@ -3946,3 +4202,77 @@ xfs_inode_count_blocks( xfs_bmap_count_leaves(ifp, rblocks); *dblocks = ip->i_nblocks - *rblocks; } + +static void +xfs_wait_dax_page( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + schedule(); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); +} + +int +xfs_break_dax_layouts( + struct inode *inode, + bool *retry) +{ + struct page *page; + + xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL); + + page = dax_layout_busy_page(inode->i_mapping); + if (!page) + return 0; + + *retry = true; + return ___wait_var_event(&page->_refcount, + atomic_read(&page->_refcount) == 1, 
TASK_INTERRUPTIBLE, + 0, 0, xfs_wait_dax_page(inode)); +} + +int +xfs_break_layouts( + struct inode *inode, + uint *iolock, + enum layout_break_reason reason) +{ + bool retry; + int error; + + xfs_assert_ilocked(XFS_I(inode), XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL); + + do { + retry = false; + switch (reason) { + case BREAK_UNMAP: + error = xfs_break_dax_layouts(inode, &retry); + if (error || retry) + break; + fallthrough; + case BREAK_WRITE: + error = xfs_break_leased_layouts(inode, iolock, &retry); + break; + default: + WARN_ON_ONCE(1); + error = -EINVAL; + } + } while (error == 0 && retry); + + return error; +} + +/* Returns the size of fundamental allocation unit for a file, in bytes. */ +unsigned int +xfs_inode_alloc_unitsize( + struct xfs_inode *ip) +{ + unsigned int blocks = 1; + + if (XFS_IS_REALTIME_INODE(ip)) + blocks = ip->i_mount->m_sb.sb_rextsize; + + return XFS_FSB_TO_B(ip->i_mount, blocks); +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index ab46ffb3ac19..292b90b5f2ac 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -207,13 +207,13 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size) * i_flags helper functions */ static inline void -__xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) +__xfs_iflags_set(xfs_inode_t *ip, unsigned long flags) { ip->i_flags |= flags; } static inline void -xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) +xfs_iflags_set(xfs_inode_t *ip, unsigned long flags) { spin_lock(&ip->i_flags_lock); __xfs_iflags_set(ip, flags); @@ -221,7 +221,7 @@ xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) } static inline void -xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags) +xfs_iflags_clear(xfs_inode_t *ip, unsigned long flags) { spin_lock(&ip->i_flags_lock); ip->i_flags &= ~flags; @@ -229,13 +229,13 @@ xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags) } static inline int -__xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) +__xfs_iflags_test(xfs_inode_t *ip, unsigned long flags) { 
return (ip->i_flags & flags); } static inline int -xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) +xfs_iflags_test(xfs_inode_t *ip, unsigned long flags) { int ret; spin_lock(&ip->i_flags_lock); @@ -245,7 +245,7 @@ xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) } static inline int -xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) +xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned long flags) { int ret; @@ -258,7 +258,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) } static inline int -xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags) +xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned long flags) { int ret; @@ -312,6 +312,15 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) } /* + * Decide if this file is a realtime file whose data allocation unit is larger + * than a single filesystem block. + */ +static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) +{ + return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1; +} + +/* * Return the buftarg used for data allocations on a given inode. 
*/ #define xfs_inode_buftarg(ip) \ @@ -513,7 +522,7 @@ int xfs_create(struct mnt_idmap *idmap, umode_t mode, dev_t rdev, bool need_xattr, struct xfs_inode **ipp); int xfs_create_tmpfile(struct mnt_idmap *idmap, - struct xfs_inode *dp, umode_t mode, + struct xfs_inode *dp, umode_t mode, bool init_xattrs, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); @@ -565,16 +574,10 @@ xfs_itruncate_extents( return xfs_itruncate_extents_flags(tpp, ip, whichfork, new_size, 0); } -/* from xfs_file.c */ int xfs_break_dax_layouts(struct inode *inode, bool *retry); int xfs_break_layouts(struct inode *inode, uint *iolock, enum layout_break_reason reason); -/* from xfs_iops.c */ -extern void xfs_setup_inode(struct xfs_inode *ip); -extern void xfs_setup_iops(struct xfs_inode *ip); -extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); - static inline void xfs_update_stable_writes(struct xfs_inode *ip) { if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev)) @@ -613,11 +616,20 @@ extern struct kmem_cache *xfs_inode_cache; bool xfs_inode_needs_inactive(struct xfs_inode *ip); +int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip); +int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_inode *ip); +struct xfs_inode *xfs_iunlink_lookup(struct xfs_perag *pag, xfs_agino_t agino); + void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2); +int xfs_droplink(struct xfs_trans *tp, struct xfs_inode *ip); +void xfs_bumplink(struct xfs_trans *tp, struct xfs_inode *ip); +void xfs_lock_inodes(struct xfs_inode **ips, int inodes, uint lock_mode); +void xfs_sort_inodes(struct xfs_inode **i_tab, unsigned int num_inodes); static inline bool xfs_inode_unlinked_incomplete( @@ -631,6 +643,7 @@ int 
xfs_inode_reload_unlinked(struct xfs_inode *ip); bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork); void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, xfs_filblks_t *dblocks, xfs_filblks_t *rblocks); +unsigned int xfs_inode_alloc_unitsize(struct xfs_inode *ip); struct xfs_dir_update_params { const struct xfs_inode *dp; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d0e2cec6210d..f0117188f302 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -23,11 +23,9 @@ #include "xfs_fsops.h" #include "xfs_discard.h" #include "xfs_quota.h" -#include "xfs_export.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_trans.h" -#include "xfs_acl.h" #include "xfs_btree.h" #include <linux/fsmap.h> #include "xfs_fsmap.h" @@ -39,596 +37,13 @@ #include "xfs_ioctl.h" #include "xfs_xattr.h" #include "xfs_rtbitmap.h" +#include "xfs_file.h" +#include "xfs_exchrange.h" +#include "xfs_handle.h" #include <linux/mount.h> -#include <linux/namei.h> #include <linux/fileattr.h> -/* - * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to - * a file or fs handle. 
- * - * XFS_IOC_PATH_TO_FSHANDLE - * returns fs handle for a mount point or path within that mount point - * XFS_IOC_FD_TO_HANDLE - * returns full handle for a FD opened in user space - * XFS_IOC_PATH_TO_HANDLE - * returns full handle for a path - */ -int -xfs_find_handle( - unsigned int cmd, - xfs_fsop_handlereq_t *hreq) -{ - int hsize; - xfs_handle_t handle; - struct inode *inode; - struct fd f = {NULL}; - struct path path; - int error; - struct xfs_inode *ip; - - if (cmd == XFS_IOC_FD_TO_HANDLE) { - f = fdget(hreq->fd); - if (!f.file) - return -EBADF; - inode = file_inode(f.file); - } else { - error = user_path_at(AT_FDCWD, hreq->path, 0, &path); - if (error) - return error; - inode = d_inode(path.dentry); - } - ip = XFS_I(inode); - - /* - * We can only generate handles for inodes residing on a XFS filesystem, - * and only for regular files, directories or symbolic links. - */ - error = -EINVAL; - if (inode->i_sb->s_magic != XFS_SB_MAGIC) - goto out_put; - - error = -EBADF; - if (!S_ISREG(inode->i_mode) && - !S_ISDIR(inode->i_mode) && - !S_ISLNK(inode->i_mode)) - goto out_put; - - - memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); - - if (cmd == XFS_IOC_PATH_TO_FSHANDLE) { - /* - * This handle only contains an fsid, zero the rest. - */ - memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); - hsize = sizeof(xfs_fsid_t); - } else { - handle.ha_fid.fid_len = sizeof(xfs_fid_t) - - sizeof(handle.ha_fid.fid_len); - handle.ha_fid.fid_pad = 0; - handle.ha_fid.fid_gen = inode->i_generation; - handle.ha_fid.fid_ino = ip->i_ino; - hsize = sizeof(xfs_handle_t); - } - - error = -EFAULT; - if (copy_to_user(hreq->ohandle, &handle, hsize) || - copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) - goto out_put; - - error = 0; - - out_put: - if (cmd == XFS_IOC_FD_TO_HANDLE) - fdput(f); - else - path_put(&path); - return error; -} - -/* - * No need to do permission checks on the various pathname components - * as the handle operations are privileged. 
- */ -STATIC int -xfs_handle_acceptable( - void *context, - struct dentry *dentry) -{ - return 1; -} - -/* - * Convert userspace handle data into a dentry. - */ -struct dentry * -xfs_handle_to_dentry( - struct file *parfilp, - void __user *uhandle, - u32 hlen) -{ - xfs_handle_t handle; - struct xfs_fid64 fid; - - /* - * Only allow handle opens under a directory. - */ - if (!S_ISDIR(file_inode(parfilp)->i_mode)) - return ERR_PTR(-ENOTDIR); - - if (hlen != sizeof(xfs_handle_t)) - return ERR_PTR(-EINVAL); - if (copy_from_user(&handle, uhandle, hlen)) - return ERR_PTR(-EFAULT); - if (handle.ha_fid.fid_len != - sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len)) - return ERR_PTR(-EINVAL); - - memset(&fid, 0, sizeof(struct fid)); - fid.ino = handle.ha_fid.fid_ino; - fid.gen = handle.ha_fid.fid_gen; - - return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3, - FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG, - xfs_handle_acceptable, NULL); -} - -STATIC struct dentry * -xfs_handlereq_to_dentry( - struct file *parfilp, - xfs_fsop_handlereq_t *hreq) -{ - return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen); -} - -int -xfs_open_by_handle( - struct file *parfilp, - xfs_fsop_handlereq_t *hreq) -{ - const struct cred *cred = current_cred(); - int error; - int fd; - int permflag; - struct file *filp; - struct inode *inode; - struct dentry *dentry; - fmode_t fmode; - struct path path; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - dentry = xfs_handlereq_to_dentry(parfilp, hreq); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - inode = d_inode(dentry); - - /* Restrict xfs_open_by_handle to directories & regular files. 
*/ - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { - error = -EPERM; - goto out_dput; - } - -#if BITS_PER_LONG != 32 - hreq->oflags |= O_LARGEFILE; -#endif - - permflag = hreq->oflags; - fmode = OPEN_FMODE(permflag); - if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && - (fmode & FMODE_WRITE) && IS_APPEND(inode)) { - error = -EPERM; - goto out_dput; - } - - if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) { - error = -EPERM; - goto out_dput; - } - - /* Can't write directories. */ - if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) { - error = -EISDIR; - goto out_dput; - } - - fd = get_unused_fd_flags(0); - if (fd < 0) { - error = fd; - goto out_dput; - } - - path.mnt = parfilp->f_path.mnt; - path.dentry = dentry; - filp = dentry_open(&path, hreq->oflags, cred); - dput(dentry); - if (IS_ERR(filp)) { - put_unused_fd(fd); - return PTR_ERR(filp); - } - - if (S_ISREG(inode->i_mode)) { - filp->f_flags |= O_NOATIME; - filp->f_mode |= FMODE_NOCMTIME; - } - - fd_install(fd, filp); - return fd; - - out_dput: - dput(dentry); - return error; -} - -int -xfs_readlink_by_handle( - struct file *parfilp, - xfs_fsop_handlereq_t *hreq) -{ - struct dentry *dentry; - __u32 olen; - int error; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - dentry = xfs_handlereq_to_dentry(parfilp, hreq); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - - /* Restrict this handle operation to symlinks only. */ - if (!d_is_symlink(dentry)) { - error = -EINVAL; - goto out_dput; - } - - if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) { - error = -EFAULT; - goto out_dput; - } - - error = vfs_readlink(dentry, hreq->ohandle, olen); - - out_dput: - dput(dentry); - return error; -} - -/* - * Format an attribute and copy it out to the user's buffer. - * Take care to check values and protect against them changing later, - * we may be reading them directly out of a user buffer. 
- */ -static void -xfs_ioc_attr_put_listent( - struct xfs_attr_list_context *context, - int flags, - unsigned char *name, - int namelen, - int valuelen) -{ - struct xfs_attrlist *alist = context->buffer; - struct xfs_attrlist_ent *aep; - int arraytop; - - ASSERT(!context->seen_enough); - ASSERT(context->count >= 0); - ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); - ASSERT(context->firstu >= sizeof(*alist)); - ASSERT(context->firstu <= context->bufsize); - - /* - * Only list entries in the right namespace. - */ - if (context->attr_filter != (flags & XFS_ATTR_NSP_ONDISK_MASK)) - return; - - arraytop = sizeof(*alist) + - context->count * sizeof(alist->al_offset[0]); - - /* decrement by the actual bytes used by the attr */ - context->firstu -= round_up(offsetof(struct xfs_attrlist_ent, a_name) + - namelen + 1, sizeof(uint32_t)); - if (context->firstu < arraytop) { - trace_xfs_attr_list_full(context); - alist->al_more = 1; - context->seen_enough = 1; - return; - } - - aep = context->buffer + context->firstu; - aep->a_valuelen = valuelen; - memcpy(aep->a_name, name, namelen); - aep->a_name[namelen] = 0; - alist->al_offset[context->count++] = context->firstu; - alist->al_count = context->count; - trace_xfs_attr_list_add(context); -} - -static unsigned int -xfs_attr_filter( - u32 ioc_flags) -{ - if (ioc_flags & XFS_IOC_ATTR_ROOT) - return XFS_ATTR_ROOT; - if (ioc_flags & XFS_IOC_ATTR_SECURE) - return XFS_ATTR_SECURE; - return 0; -} - -static unsigned int -xfs_attr_flags( - u32 ioc_flags) -{ - if (ioc_flags & XFS_IOC_ATTR_CREATE) - return XATTR_CREATE; - if (ioc_flags & XFS_IOC_ATTR_REPLACE) - return XATTR_REPLACE; - return 0; -} - -int -xfs_ioc_attr_list( - struct xfs_inode *dp, - void __user *ubuf, - size_t bufsize, - int flags, - struct xfs_attrlist_cursor __user *ucursor) -{ - struct xfs_attr_list_context context = { }; - struct xfs_attrlist *alist; - void *buffer; - int error; - - if (bufsize < sizeof(struct xfs_attrlist) || - bufsize > XFS_XATTR_LIST_MAX) - return 
-EINVAL; - - /* - * Reject flags, only allow namespaces. - */ - if (flags & ~(XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE)) - return -EINVAL; - if (flags == (XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE)) - return -EINVAL; - - /* - * Validate the cursor. - */ - if (copy_from_user(&context.cursor, ucursor, sizeof(context.cursor))) - return -EFAULT; - if (context.cursor.pad1 || context.cursor.pad2) - return -EINVAL; - if (!context.cursor.initted && - (context.cursor.hashval || context.cursor.blkno || - context.cursor.offset)) - return -EINVAL; - - buffer = kvzalloc(bufsize, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - - /* - * Initialize the output buffer. - */ - context.dp = dp; - context.resynch = 1; - context.attr_filter = xfs_attr_filter(flags); - context.buffer = buffer; - context.bufsize = round_down(bufsize, sizeof(uint32_t)); - context.firstu = context.bufsize; - context.put_listent = xfs_ioc_attr_put_listent; - - alist = context.buffer; - alist->al_count = 0; - alist->al_more = 0; - alist->al_offset[0] = context.bufsize; - - error = xfs_attr_list(&context); - if (error) - goto out_free; - - if (copy_to_user(ubuf, buffer, bufsize) || - copy_to_user(ucursor, &context.cursor, sizeof(context.cursor))) - error = -EFAULT; -out_free: - kvfree(buffer); - return error; -} - -STATIC int -xfs_attrlist_by_handle( - struct file *parfilp, - struct xfs_fsop_attrlist_handlereq __user *p) -{ - struct xfs_fsop_attrlist_handlereq al_hreq; - struct dentry *dentry; - int error = -ENOMEM; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (copy_from_user(&al_hreq, p, sizeof(al_hreq))) - return -EFAULT; - - dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - - error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), al_hreq.buffer, - al_hreq.buflen, al_hreq.flags, &p->pos); - dput(dentry); - return error; -} - -static int -xfs_attrmulti_attr_get( - struct inode *inode, - unsigned char *name, - unsigned char __user *ubuf, - uint32_t 
*len, - uint32_t flags) -{ - struct xfs_da_args args = { - .dp = XFS_I(inode), - .attr_filter = xfs_attr_filter(flags), - .attr_flags = xfs_attr_flags(flags), - .name = name, - .namelen = strlen(name), - .valuelen = *len, - }; - int error; - - if (*len > XFS_XATTR_SIZE_MAX) - return -EINVAL; - - error = xfs_attr_get(&args); - if (error) - goto out_kfree; - - *len = args.valuelen; - if (copy_to_user(ubuf, args.value, args.valuelen)) - error = -EFAULT; - -out_kfree: - kvfree(args.value); - return error; -} - -static int -xfs_attrmulti_attr_set( - struct inode *inode, - unsigned char *name, - const unsigned char __user *ubuf, - uint32_t len, - uint32_t flags) -{ - struct xfs_da_args args = { - .dp = XFS_I(inode), - .attr_filter = xfs_attr_filter(flags), - .attr_flags = xfs_attr_flags(flags), - .name = name, - .namelen = strlen(name), - }; - int error; - - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - - if (ubuf) { - if (len > XFS_XATTR_SIZE_MAX) - return -EINVAL; - args.value = memdup_user(ubuf, len); - if (IS_ERR(args.value)) - return PTR_ERR(args.value); - args.valuelen = len; - } - - error = xfs_attr_change(&args); - if (!error && (flags & XFS_IOC_ATTR_ROOT)) - xfs_forget_acl(inode, name); - kfree(args.value); - return error; -} - -int -xfs_ioc_attrmulti_one( - struct file *parfilp, - struct inode *inode, - uint32_t opcode, - void __user *uname, - void __user *value, - uint32_t *len, - uint32_t flags) -{ - unsigned char *name; - int error; - - if ((flags & XFS_IOC_ATTR_ROOT) && (flags & XFS_IOC_ATTR_SECURE)) - return -EINVAL; - - name = strndup_user(uname, MAXNAMELEN); - if (IS_ERR(name)) - return PTR_ERR(name); - - switch (opcode) { - case ATTR_OP_GET: - error = xfs_attrmulti_attr_get(inode, name, value, len, flags); - break; - case ATTR_OP_REMOVE: - value = NULL; - *len = 0; - fallthrough; - case ATTR_OP_SET: - error = mnt_want_write_file(parfilp); - if (error) - break; - error = xfs_attrmulti_attr_set(inode, name, value, *len, flags); - 
mnt_drop_write_file(parfilp); - break; - default: - error = -EINVAL; - break; - } - - kfree(name); - return error; -} - -STATIC int -xfs_attrmulti_by_handle( - struct file *parfilp, - void __user *arg) -{ - int error; - xfs_attr_multiop_t *ops; - xfs_fsop_attrmulti_handlereq_t am_hreq; - struct dentry *dentry; - unsigned int i, size; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) - return -EFAULT; - - /* overflow check */ - if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t)) - return -E2BIG; - - dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - - error = -E2BIG; - size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); - if (!size || size > 16 * PAGE_SIZE) - goto out_dput; - - ops = memdup_user(am_hreq.ops, size); - if (IS_ERR(ops)) { - error = PTR_ERR(ops); - goto out_dput; - } - - error = 0; - for (i = 0; i < am_hreq.opcount; i++) { - ops[i].am_error = xfs_ioc_attrmulti_one(parfilp, - d_inode(dentry), ops[i].am_opcode, - ops[i].am_attrname, ops[i].am_attrvalue, - &ops[i].am_length, ops[i].am_flags); - } - - if (copy_to_user(am_hreq.ops, ops, size)) - error = -EFAULT; - - kfree(ops); - out_dput: - dput(dentry); - return error; -} - /* Return 0 on success or positive error */ int xfs_fsbulkstat_one_fmt( @@ -1640,30 +1055,6 @@ out_free: return error; } -STATIC int -xfs_ioc_scrub_metadata( - struct file *file, - void __user *arg) -{ - struct xfs_scrub_metadata scrub; - int error; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&scrub, arg, sizeof(scrub))) - return -EFAULT; - - error = xfs_scrub_metadata(file, &scrub); - if (error) - return error; - - if (copy_to_user(arg, &scrub, sizeof(scrub))) - return -EFAULT; - - return 0; -} - int xfs_ioc_swapext( xfs_swapext_t *sxp) @@ -2010,7 +1401,10 @@ xfs_file_ioctl( case XFS_IOC_FSGETXATTRA: return xfs_ioc_fsgetxattra(ip, arg); - + case XFS_IOC_GETPARENTS: 
+ return xfs_ioc_getparents(filp, arg); + case XFS_IOC_GETPARENTS_BY_HANDLE: + return xfs_ioc_getparents_by_handle(filp, arg); case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: case XFS_IOC_GETBMAPX: @@ -2019,6 +1413,8 @@ xfs_file_ioctl( case FS_IOC_GETFSMAP: return xfs_ioc_getfsmap(ip, arg); + case XFS_IOC_SCRUBV_METADATA: + return xfs_ioc_scrubv_metadata(filp, arg); case XFS_IOC_SCRUB_METADATA: return xfs_ioc_scrub_metadata(filp, arg); @@ -2169,6 +1565,9 @@ xfs_file_ioctl( return error; } + case XFS_IOC_EXCHANGE_RANGE: + return xfs_ioc_exchange_range(filp, arg); + default: return -ENOTTY; } diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 38be600b5e1e..12124946f347 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -15,34 +15,6 @@ xfs_ioc_swapext( xfs_swapext_t *sxp); extern int -xfs_find_handle( - unsigned int cmd, - xfs_fsop_handlereq_t *hreq); - -extern int -xfs_open_by_handle( - struct file *parfilp, - xfs_fsop_handlereq_t *hreq); - -extern int -xfs_readlink_by_handle( - struct file *parfilp, - xfs_fsop_handlereq_t *hreq); - -int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode, - uint32_t opcode, void __user *uname, void __user *value, - uint32_t *len, uint32_t flags); -int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, - size_t bufsize, int flags, - struct xfs_attrlist_cursor __user *ucursor); - -extern struct dentry * -xfs_handle_to_dentry( - struct file *parfilp, - void __user *uhandle, - u32 hlen); - -extern int xfs_fileattr_get( struct dentry *dentry, struct fileattr *fa); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index ee35eea1ecce..b64785dc4354 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -24,6 +24,7 @@ #include "xfs_ioctl32.h" #include "xfs_trace.h" #include "xfs_sb.h" +#include "xfs_handle.h" #define _NATIVE_IOC(cmd, type) \ _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 4087af7f3c9f..378342673925 
100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -28,6 +28,7 @@ #include "xfs_dquot.h" #include "xfs_reflink.h" #include "xfs_health.h" +#include "xfs_rtbitmap.h" #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) @@ -298,9 +299,7 @@ xfs_iomap_write_direct( if (error) return error; - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, nr_exts); + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, nr_exts); if (error) goto out_trans_cancel; @@ -321,14 +320,6 @@ xfs_iomap_write_direct( if (error) goto out_unlock; - /* - * Copy any maps to caller's array and return any error. - */ - if (nimaps == 0) { - error = -ENOSPC; - goto out_unlock; - } - if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = xfs_alert_fsblock_zero(ip, imap); @@ -404,6 +395,29 @@ xfs_quota_calc_throttle( } } +static int64_t +xfs_iomap_freesp( + struct percpu_counter *counter, + uint64_t low_space[XFS_LOWSP_MAX], + int *shift) +{ + int64_t freesp; + + freesp = percpu_counter_read_positive(counter); + if (freesp < low_space[XFS_LOWSP_5_PCNT]) { + *shift = 2; + if (freesp < low_space[XFS_LOWSP_4_PCNT]) + (*shift)++; + if (freesp < low_space[XFS_LOWSP_3_PCNT]) + (*shift)++; + if (freesp < low_space[XFS_LOWSP_2_PCNT]) + (*shift)++; + if (freesp < low_space[XFS_LOWSP_1_PCNT]) + (*shift)++; + } + return freesp; +} + /* * If we don't have a user specified preallocation size, dynamically increase * the preallocation size as the size of the file grows. 
Cap the maximum size @@ -486,18 +500,13 @@ xfs_iomap_prealloc_size( alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN), alloc_blocks); - freesp = percpu_counter_read_positive(&mp->m_fdblocks); - if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { - shift = 2; - if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) - shift++; - } + if (unlikely(XFS_IS_REALTIME_INODE(ip))) + freesp = xfs_rtx_to_rtb(mp, + xfs_iomap_freesp(&mp->m_frextents, + mp->m_low_rtexts, &shift)); + else + freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space, + &shift); /* * Check each quota to cap the prealloc size, provide a shift value to @@ -606,11 +615,8 @@ xfs_iomap_write_unwritten( if (error) return error; - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_WRITE_UNWRITTEN_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, - XFS_IEXT_WRITE_UNWRITTEN_CNT); if (error) goto error_on_bmapi_transaction; @@ -982,8 +988,6 @@ xfs_buffered_write_iomap_begin( return xfs_direct_write_iomap_begin(inode, offset, count, flags, iomap, srcmap); - ASSERT(!XFS_IS_REALTIME_INODE(ip)); - error = xfs_qm_dqattach(ip); if (error) return error; @@ -1023,6 +1027,24 @@ xfs_buffered_write_iomap_begin( } /* + * For zeroing, trim a delalloc extent that extends beyond the EOF + * block. If it starts beyond the EOF block, convert it to an + * unwritten extent. 
+ */ + if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb && + isnullstartblock(imap.br_startblock)) { + xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); + + if (offset_fsb >= eof_fsb) + goto convert_delay; + if (end_fsb > eof_fsb) { + end_fsb = eof_fsb; + xfs_trim_extent(&imap, offset_fsb, + end_fsb - offset_fsb); + } + } + + /* * Search the COW fork extent list even if we did not find a data fork * extent. This serves two purposes: first this implements the * speculative preallocation using cowextsize, so that we also unshare @@ -1158,15 +1180,26 @@ retry: * them out if the write happens to fail. */ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); found_imap: seq = xfs_iomap_inode_sequence(ip, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); +convert_delay: + xfs_iunlock(ip, lockmode); + truncate_pagecache(inode, offset); + error = xfs_bmapi_convert_delalloc(ip, XFS_DATA_FORK, offset, + iomap, NULL); + if (error) + return error; + + trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &imap); + return 0; + found_cow: seq = xfs_iomap_inode_sequence(ip, 0); if (imap.br_startoff <= offset_fsb) { @@ -1174,17 +1207,17 @@ found_cow: if (error) goto out_unlock; seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); } xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return error; } @@ -1194,8 +1227,8 @@ xfs_buffered_write_delalloc_punch( 
loff_t offset, loff_t length) { - return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, - offset + length); + xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, offset + length); + return 0; } static int diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 66f8c47642e8..ff222827e550 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -25,6 +25,7 @@ #include "xfs_error.h" #include "xfs_ioctl.h" #include "xfs_xattr.h" +#include "xfs_file.h" #include <linux/posix_acl.h> #include <linux/security.h> @@ -62,7 +63,7 @@ xfs_initxattrs( .value = xattr->value, .valuelen = xattr->value_len, }; - error = xfs_attr_change(&args); + error = xfs_attr_change(&args, XFS_ATTRUPDATE_UPSERT); if (error < 0) break; } @@ -156,6 +157,8 @@ xfs_create_need_xattr( if (dir->i_sb->s_security) return true; #endif + if (xfs_has_parent(XFS_I(dir)->i_mount)) + return true; return false; } @@ -200,7 +203,18 @@ xfs_generic_create( xfs_create_need_xattr(dir, default_acl, acl), &ip); } else { - error = xfs_create_tmpfile(idmap, XFS_I(dir), mode, &ip); + bool init_xattrs = false; + + /* + * If this temporary file will be linkable, set up the file + * with an attr fork to receive a parent pointer. + */ + if (!(tmpfile->f_flags & O_EXCL) && + xfs_has_parent(XFS_I(dir)->i_mount)) + init_xattrs = true; + + error = xfs_create_tmpfile(idmap, XFS_I(dir), mode, + init_xattrs, &ip); } if (unlikely(error)) goto out_free_acl; @@ -364,6 +378,9 @@ xfs_vn_link( if (unlikely(error)) return error; + if (IS_PRIVATE(inode)) + return -EPERM; + error = xfs_link(XFS_I(dir), XFS_I(inode), &name); if (unlikely(error)) return error; @@ -521,7 +538,7 @@ xfs_stat_blksize( * always return the realtime extent size. */ if (XFS_IS_REALTIME_INODE(ip)) - return XFS_FSB_TO_B(mp, xfs_get_extsz_hint(ip)); + return XFS_FSB_TO_B(mp, xfs_get_extsz_hint(ip) ? 
: 1); /* * Allow large block sizes to be reported to userspace programs if the diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 7f84a0843b24..3c1a2605ffd2 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -8,9 +8,6 @@ struct xfs_inode; -extern const struct file_operations xfs_file_operations; -extern const struct file_operations xfs_dir_file_operations; - extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); int xfs_vn_setattr_size(struct mnt_idmap *idmap, @@ -19,4 +16,8 @@ int xfs_vn_setattr_size(struct mnt_idmap *idmap, int xfs_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr); +extern void xfs_setup_inode(struct xfs_inode *ip); +extern void xfs_setup_iops(struct xfs_inode *ip); +extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); + #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 95fc31b9f87d..c0757ab99495 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -97,6 +97,14 @@ xfs_bulkstat_one_int( vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid = i_gid_into_vfsgid(idmap, inode); + /* If this is a private inode, don't leak its details to userspace. */ + if (IS_PRIVATE(inode)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_irele(ip); + error = -EINVAL; + goto out_advance; + } + /* xfs_iget returns the following without needing * further change. */ diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 01b55f03a102..730c8d48da28 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -268,7 +268,7 @@ xfs_iwalk_ag_start( /* Set up a fresh cursor and empty the inobt cache. */ iwag->nr_recs = 0; - error = xfs_ialloc_read_agi(pag, tp, agi_bpp); + error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp); if (error) return error; *curpp = xfs_inobt_init_cursor(pag, tp, *agi_bpp); @@ -386,7 +386,7 @@ xfs_iwalk_run_callbacks( } /* ...and recreate the cursor just past where we left off. 
*/ - error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, agi_bpp); + error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, 0, agi_bpp); if (error) return error; *curpp = xfs_inobt_init_cursor(iwag->pag, iwag->tp, *agi_bpp); diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 8f07c9f6157f..ac355328121a 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -198,6 +198,11 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) return x; } +static inline bool isaligned_64(uint64_t x, uint32_t y) +{ + return do_div(x, y) == 0; +} + /* If @b is a power of 2, return log2(b). Else return -1. */ static inline int8_t log2_if_power2(unsigned long b) { diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 5004f23d344e..416c15494983 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1448,7 +1448,7 @@ xfs_log_work_queue( * Clear the log incompat flags if we have the opportunity. * * This only happens if we're about to log the second dummy transaction as part - * of covering the log and we can get the log incompat feature usage lock. + * of covering the log. */ static inline void xlog_clear_incompat( @@ -1463,11 +1463,7 @@ xlog_clear_incompat( if (log->l_covered_state != XLOG_STATE_COVER_DONE2) return; - if (!down_write_trylock(&log->l_incompat_users)) - return; - xfs_clear_incompat_log_features(mp); - up_write(&log->l_incompat_users); } /* @@ -1585,8 +1581,6 @@ xlog_alloc_log( } log->l_sectBBsize = 1 << log2_size; - init_rwsem(&log->l_incompat_users); - xlog_get_iclog_buffer_size(mp, log); spin_lock_init(&log->l_icloglock); @@ -3871,23 +3865,3 @@ xfs_log_check_lsn( return valid; } - -/* - * Notify the log that we're about to start using a feature that is protected - * by a log incompat feature flag. This will prevent log covering from - * clearing those flags. - */ -void -xlog_use_incompat_feat( - struct xlog *log) -{ - down_read(&log->l_incompat_users); -} - -/* Notify the log that we've finished using log incompat features. 
*/ -void -xlog_drop_incompat_feat( - struct xlog *log) -{ - up_read(&log->l_incompat_users); -} diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 2728886c2963..d69acf881153 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -159,8 +159,6 @@ bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes); bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags); -void xlog_use_incompat_feat(struct xlog *log); -void xlog_drop_incompat_feat(struct xlog *log); int xfs_attr_use_log_assist(struct xfs_mount *mp); #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 73f5b7f628f4..f51cbc6405c1 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -1378,7 +1378,7 @@ out_abort_free_ticket: */ static void xlog_cil_push_background( - struct xlog *log) __releases(cil->xc_ctx_lock) + struct xlog *log) { struct xfs_cil *cil = log->l_cilp; int space_used = atomic_read(&cil->xc_ctx->space_used); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index e30c06ec20e3..40e22ec0fbe6 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -450,9 +450,6 @@ struct xlog { xfs_lsn_t l_recovery_lsn; uint32_t l_iclog_roundoff;/* padding roundoff */ - - /* Users of log incompat features should take a read lock. */ - struct rw_semaphore l_incompat_users; }; /* @@ -623,7 +620,8 @@ xlog_wait( remove_wait_queue(wq, &wait); } -int xlog_wait_on_iclog(struct xlog_in_core *iclog); +int xlog_wait_on_iclog(struct xlog_in_core *iclog) + __releases(iclog->ic_log->l_icloglock); /* * The LSN is valid so long as it is behind the current LSN. If it isn't, this @@ -683,7 +681,7 @@ xlog_valid_lsn( * flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() * will do direct reclaim and compaction in the slow path, both of which are * horrendously expensive. 
We just want kmalloc to fail fast and fall back to - * vmalloc if it can't get somethign straight away from the free lists or + * vmalloc if it can't get something straight away from the free lists or * buddy allocator. Hence we have to open code kvmalloc outselves here. * * This assumes that the caller uses memalloc_nofs_save task context here, so diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 13f1d2e91540..4fe627991e86 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1767,6 +1767,37 @@ xlog_recover_iget( return 0; } +/* + * Get an inode so that we can recover a log operation. + * + * Log intent items that target inodes effectively contain a file handle. + * Check that the generation number matches the intent item like we do for + * other file handles. Log intent items defined after this validation weakness + * was identified must use this function. + */ +int +xlog_recover_iget_handle( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t gen, + struct xfs_inode **ipp) +{ + struct xfs_inode *ip; + int error; + + error = xlog_recover_iget(mp, ino, &ip); + if (error) + return error; + + if (VFS_I(ip)->i_generation != gen) { + xfs_irele(ip); + return -EFSCORRUPTED; + } + + *ipp = ip; + return 0; +} + /****************************************************************************** * * Log recover routines @@ -1789,6 +1820,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = { &xlog_bud_item_ops, &xlog_attri_item_ops, &xlog_attrd_item_ops, + &xlog_xmi_item_ops, + &xlog_xmd_item_ops, }; static const struct xlog_recover_item_ops * @@ -2656,7 +2689,7 @@ xlog_recover_clear_agi_bucket( if (error) goto out_error; - error = xfs_read_agi(pag, tp, &agibp); + error = xfs_read_agi(pag, tp, 0, &agibp); if (error) goto out_abort; @@ -2772,7 +2805,7 @@ xlog_recover_iunlink_ag( int bucket; int error; - error = xfs_read_agi(pag, NULL, &agibp); + error = xfs_read_agi(pag, NULL, 0, &agibp); if (error) { /* * AGI is 
b0rked. Don't process it. @@ -2966,7 +2999,7 @@ xlog_do_recovery_pass( int error = 0, h_size, h_len; int error2 = 0; int bblks, split_bblks; - int hblks, split_hblks, wrapped_hblks; + int hblks = 1, split_hblks, wrapped_hblks; int i; struct hlist_head rhash[XLOG_RHASH_SIZE]; LIST_HEAD (buffer_list); @@ -2977,6 +3010,10 @@ xlog_do_recovery_pass( for (i = 0; i < XLOG_RHASH_SIZE; i++) INIT_HLIST_HEAD(&rhash[i]); + hbp = xlog_alloc_buffer(log, hblks); + if (!hbp) + return -ENOMEM; + /* * Read the header of the tail block and get the iclog buffer size from * h_size. Use this to tell how many sectors make up the log header. @@ -2987,10 +3024,6 @@ xlog_do_recovery_pass( * iclog header and extract the header size from it. Get a * new hbp that is the correct size. */ - hbp = xlog_alloc_buffer(log, 1); - if (!hbp) - return -ENOMEM; - error = xlog_bread(log, tail_blk, 1, hbp, &offset); if (error) goto bread_err1; @@ -3022,20 +3055,27 @@ xlog_do_recovery_pass( if (error) goto bread_err1; - hblks = xlog_logrec_hblks(log, rhead); - if (hblks != 1) { - kvfree(hbp); - hbp = xlog_alloc_buffer(log, hblks); + /* + * This open codes xlog_logrec_hblks so that we can reuse the + * fixed up h_size value calculated above. Without that we'd + * still allocate the buffer based on the incorrect on-disk + * size. 
+ */ + if (h_size > XLOG_HEADER_CYCLE_SIZE && + (rhead->h_version & cpu_to_be32(XLOG_VERSION_2))) { + hblks = DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE); + if (hblks > 1) { + kvfree(hbp); + hbp = xlog_alloc_buffer(log, hblks); + if (!hbp) + return -ENOMEM; + } } } else { ASSERT(log->l_sectBBsize == 1); - hblks = 1; - hbp = xlog_alloc_buffer(log, 1); h_size = XLOG_BIG_RECORD_BSIZE; } - if (!hbp) - return -ENOMEM; dbp = xlog_alloc_buffer(log, BTOBB(h_size)); if (!dbp) { kvfree(hbp); @@ -3496,21 +3536,6 @@ xlog_recover_finish( */ xfs_log_force(log->l_mp, XFS_LOG_SYNC); - /* - * Now that we've recovered the log and all the intents, we can clear - * the log incompat feature bits in the superblock because there's no - * longer anything to protect. We rely on the AIL push to write out the - * updated superblock after everything else. - */ - if (xfs_clear_incompat_log_features(log->l_mp)) { - error = xfs_sync_sb(log->l_mp, false); - if (error < 0) { - xfs_alert(log->l_mp, - "Failed to clear log incompat features on recovery"); - goto out_error; - } - } - xlog_recover_process_iunlinks(log); /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index df370eb5dc15..09eef1721ef4 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -34,6 +34,7 @@ #include "xfs_health.h" #include "xfs_trace.h" #include "xfs_ag.h" +#include "xfs_rtbitmap.h" #include "scrub/stats.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); @@ -230,6 +231,13 @@ reread: mp->m_features |= xfs_sb_version_to_features(sbp); xfs_reinit_percpu_counters(mp); + /* + * If logged xattrs are enabled after log recovery finishes, then set + * the opstate so that log recovery will work properly. 
+ */ + if (xfs_sb_version_haslogxattrs(&mp->m_sb)) + xfs_set_using_logged_xattrs(mp); + /* no need to be quiet anymore, so reset the buf ops */ bp->b_ops = &xfs_sb_buf_ops; @@ -828,6 +836,15 @@ xfs_mountfs( goto out_inodegc_shrinker; } + /* + * If logged xattrs are still enabled after log recovery finishes, then + * they'll be available until unmount. Otherwise, turn them off. + */ + if (xfs_sb_version_haslogxattrs(&mp->m_sb)) + xfs_set_using_logged_xattrs(mp); + else + xfs_clear_using_logged_xattrs(mp); + /* Enable background inode inactivation workers. */ xfs_inodegc_start(mp); xfs_blockgc_start(mp); @@ -1095,6 +1112,11 @@ xfs_unmountfs( "Freespace may not be correct on next mount."); xfs_unmount_check(mp); + /* + * Indicate that it's ok to clear log incompat bits before cleaning + * the log and writing the unmount record. + */ + xfs_set_done_with_log_incompat(mp); xfs_log_unmount(mp); xfs_da_unmount(mp); xfs_uuid_unmount(mp); @@ -1131,16 +1153,44 @@ xfs_fs_writable( return true; } -/* Adjust m_fdblocks or m_frextents. */ +void +xfs_add_freecounter( + struct xfs_mount *mp, + struct percpu_counter *counter, + uint64_t delta) +{ + bool has_resv_pool = (counter == &mp->m_fdblocks); + uint64_t res_used; + + /* + * If the reserve pool is depleted, put blocks back into it first. + * Most of the time the pool is full. 
+ */ + if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) { + percpu_counter_add(counter, delta); + return; + } + + spin_lock(&mp->m_sb_lock); + res_used = mp->m_resblks - mp->m_resblks_avail; + if (res_used > delta) { + mp->m_resblks_avail += delta; + } else { + delta -= res_used; + mp->m_resblks_avail = mp->m_resblks; + percpu_counter_add(counter, delta); + } + spin_unlock(&mp->m_sb_lock); +} + int -xfs_mod_freecounter( +xfs_dec_freecounter( struct xfs_mount *mp, struct percpu_counter *counter, - int64_t delta, + uint64_t delta, bool rsvd) { int64_t lcounter; - long long res_used; uint64_t set_aside = 0; s32 batch; bool has_resv_pool; @@ -1150,31 +1200,6 @@ xfs_mod_freecounter( if (rsvd) ASSERT(has_resv_pool); - if (delta > 0) { - /* - * If the reserve pool is depleted, put blocks back into it - * first. Most of the time the pool is full. - */ - if (likely(!has_resv_pool || - mp->m_resblks == mp->m_resblks_avail)) { - percpu_counter_add(counter, delta); - return 0; - } - - spin_lock(&mp->m_sb_lock); - res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); - - if (res_used > delta) { - mp->m_resblks_avail += delta; - } else { - delta -= res_used; - mp->m_resblks_avail = mp->m_resblks; - percpu_counter_add(counter, delta); - } - spin_unlock(&mp->m_sb_lock); - return 0; - } - /* * Taking blocks away, need to be more accurate the closer we * are to zero. @@ -1202,7 +1227,7 @@ xfs_mod_freecounter( */ if (has_resv_pool) set_aside = xfs_fdblocks_unavailable(mp); - percpu_counter_add_batch(counter, delta, batch); + percpu_counter_add_batch(counter, -((int64_t)delta), batch); if (__percpu_counter_compare(counter, set_aside, XFS_FDBLOCKS_BATCH) >= 0) { /* we had space! */ @@ -1214,11 +1239,11 @@ xfs_mod_freecounter( * that took us to ENOSPC. 
*/ spin_lock(&mp->m_sb_lock); - percpu_counter_add(counter, -delta); + percpu_counter_add(counter, delta); if (!has_resv_pool || !rsvd) goto fdblocks_enospc; - lcounter = (long long)mp->m_resblks_avail + delta; + lcounter = (long long)mp->m_resblks_avail - delta; if (lcounter >= 0) { mp->m_resblks_avail = lcounter; spin_unlock(&mp->m_sb_lock); @@ -1364,7 +1389,8 @@ xfs_clear_incompat_log_features( if (!xfs_has_crc(mp) || !xfs_sb_has_incompat_log_feature(&mp->m_sb, XFS_SB_FEAT_INCOMPAT_LOG_ALL) || - xfs_is_shutdown(mp)) + xfs_is_shutdown(mp) || + !xfs_is_done_with_log_incompat(mp)) return false; /* @@ -1399,9 +1425,20 @@ xfs_clear_incompat_log_features( #define XFS_DELALLOC_BATCH (4096) void xfs_mod_delalloc( - struct xfs_mount *mp, - int64_t delta) + struct xfs_inode *ip, + int64_t data_delta, + int64_t ind_delta) { - percpu_counter_add_batch(&mp->m_delalloc_blks, delta, + struct xfs_mount *mp = ip->i_mount; + + if (XFS_IS_REALTIME_INODE(ip)) { + percpu_counter_add_batch(&mp->m_delalloc_rtextents, + xfs_rtb_to_rtx(mp, data_delta), + XFS_DELALLOC_BATCH); + if (!ind_delta) + return; + data_delta = 0; + } + percpu_counter_add_batch(&mp->m_delalloc_blks, data_delta + ind_delta, XFS_DELALLOC_BATCH); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index e880aa48de68..d0567dfbc036 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -195,6 +195,12 @@ typedef struct xfs_mount { * extents or anything related to the rt device. */ struct percpu_counter m_delalloc_blks; + + /* + * RT version of the above. + */ + struct percpu_counter m_delalloc_rtextents; + /* * Global count of allocation btree blocks in use across all AGs. Only * used when perag reservation is enabled. 
Helps prevent block @@ -292,6 +298,7 @@ typedef struct xfs_mount { #define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */ #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ +#define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */ /* Mount features */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ @@ -331,19 +338,10 @@ static inline void xfs_add_ ## name (struct xfs_mount *mp) \ __XFS_ADD_FEAT(attr, ATTR) __XFS_HAS_FEAT(nlink, NLINK) __XFS_ADD_FEAT(quota, QUOTA) -__XFS_HAS_FEAT(align, ALIGN) __XFS_HAS_FEAT(dalign, DALIGN) -__XFS_HAS_FEAT(logv2, LOGV2) __XFS_HAS_FEAT(sector, SECTOR) -__XFS_HAS_FEAT(extflg, EXTFLG) __XFS_HAS_FEAT(asciici, ASCIICI) -__XFS_HAS_FEAT(lazysbcount, LAZYSBCOUNT) -__XFS_ADD_FEAT(attr2, ATTR2) __XFS_HAS_FEAT(parent, PARENT) -__XFS_ADD_FEAT(projid32, PROJID32) -__XFS_HAS_FEAT(crc, CRC) -__XFS_HAS_FEAT(v3inodes, V3INODES) -__XFS_HAS_FEAT(pquotino, PQUOTINO) __XFS_HAS_FEAT(ftype, FTYPE) __XFS_HAS_FEAT(finobt, FINOBT) __XFS_HAS_FEAT(rmapbt, RMAPBT) @@ -355,6 +353,38 @@ __XFS_HAS_FEAT(inobtcounts, INOBTCNT) __XFS_HAS_FEAT(bigtime, BIGTIME) __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) __XFS_HAS_FEAT(large_extent_counts, NREXT64) +__XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE) + +/* + * Some features are always on for v5 file systems, allow the compiler to + * eliminiate dead code when building without v4 support. 
+ */ +#define __XFS_HAS_V4_FEAT(name, NAME) \ +static inline bool xfs_has_ ## name (struct xfs_mount *mp) \ +{ \ + return !IS_ENABLED(CONFIG_XFS_SUPPORT_V4) || \ + (mp->m_features & XFS_FEAT_ ## NAME); \ +} + +#define __XFS_ADD_V4_FEAT(name, NAME) \ + __XFS_HAS_V4_FEAT(name, NAME); \ +static inline void xfs_add_ ## name (struct xfs_mount *mp) \ +{ \ + if (IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) { \ + mp->m_features |= XFS_FEAT_ ## NAME; \ + xfs_sb_version_add ## name(&mp->m_sb); \ + } \ +} + +__XFS_HAS_V4_FEAT(align, ALIGN) +__XFS_HAS_V4_FEAT(logv2, LOGV2) +__XFS_HAS_V4_FEAT(extflg, EXTFLG) +__XFS_HAS_V4_FEAT(lazysbcount, LAZYSBCOUNT) +__XFS_ADD_V4_FEAT(attr2, ATTR2) +__XFS_ADD_V4_FEAT(projid32, PROJID32) +__XFS_HAS_V4_FEAT(v3inodes, V3INODES) +__XFS_HAS_V4_FEAT(crc, CRC) +__XFS_HAS_V4_FEAT(pquotino, PQUOTINO) /* * Mount features @@ -412,6 +442,10 @@ __XFS_HAS_FEAT(nouuid, NOUUID) #define XFS_OPSTATE_WARNED_LARP 9 /* Mount time quotacheck is running */ #define XFS_OPSTATE_QUOTACHECK_RUNNING 10 +/* Do we want to clear log incompat flags? 
*/ +#define XFS_OPSTATE_UNSET_LOG_INCOMPAT 11 +/* Filesystem can use logged extended attributes */ +#define XFS_OPSTATE_USE_LARP 12 #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ @@ -439,6 +473,8 @@ __XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING) #else # define xfs_is_quotacheck_running(mp) (false) #endif +__XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT) +__XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP) static inline bool xfs_should_warn(struct xfs_mount *mp, long nr) @@ -457,7 +493,9 @@ xfs_should_warn(struct xfs_mount *mp, long nr) { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \ - { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" } + { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }, \ + { (1UL << XFS_OPSTATE_UNSET_LOG_INCOMPAT), "unset_log_incompat" }, \ + { (1UL << XFS_OPSTATE_USE_LARP), "logged_xattrs" } /* * Max and min values for mount-option defined I/O @@ -534,19 +572,30 @@ xfs_fdblocks_unavailable( return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); } -int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, - int64_t delta, bool rsvd); +int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, + uint64_t delta, bool rsvd); +void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, + uint64_t delta); + +static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta, + bool reserved) +{ + return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved); +} + +static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta) +{ + xfs_add_freecounter(mp, &mp->m_fdblocks, delta); +} -static inline int -xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved) +static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta) { - return xfs_mod_freecounter(mp, 
&mp->m_fdblocks, delta, reserved); + return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false); } -static inline int -xfs_mod_frextents(struct xfs_mount *mp, int64_t delta) +static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta) { - return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false); + xfs_add_freecounter(mp, &mp->m_frextents, delta); } extern int xfs_readsb(xfs_mount_t *, int); @@ -566,6 +615,7 @@ struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp, void xfs_force_summary_recalc(struct xfs_mount *mp); int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature); bool xfs_clear_incompat_log_features(struct xfs_mount *mp); -void xfs_mod_delalloc(struct xfs_mount *mp, int64_t delta); +void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta, + int64_t ind_delta); #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 0f4cf4170c35..47120b745c47 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -836,8 +836,10 @@ xfs_qm_qino_alloc( ASSERT(xfs_is_shutdown(mp)); xfs_alert(mp, "%s failed (error %d)!", __func__, error); } - if (need_alloc) + if (need_alloc) { + xfs_iunlock(*ipp, XFS_ILOCK_EXCL); xfs_finish_inode_setup(*ipp); + } return error; } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index f5993012bf98..6e09dfcd13e2 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -136,7 +136,7 @@ enum { XFS_QM_TRANS_PRJ, XFS_QM_TRANS_DQTYPES }; -#define XFS_QM_TRANS_MAXDQS 2 +#define XFS_QM_TRANS_MAXDQS 5 struct xfs_dquot_acct { struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS]; }; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 85a4ae1a17f6..23d71a55bbc0 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -123,12 +123,6 @@ extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *); extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); extern void xfs_qm_unmount_quotas(struct xfs_mount *); - -static 
inline int -xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) -{ - return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false); -} bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type); # ifdef CONFIG_XFS_LIVE_HOOKS @@ -188,12 +182,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, } static inline int -xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) -{ - return 0; -} - -static inline int xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, int64_t dblocks) { @@ -222,9 +210,16 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, #endif /* CONFIG_XFS_QUOTA */ static inline int -xfs_quota_unreserve_blkres(struct xfs_inode *ip, int64_t blocks) +xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) +{ + return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false); +} + +static inline void +xfs_quota_unreserve_blkres(struct xfs_inode *ip, uint64_t blocks) { - return xfs_quota_reserve_blkres(ip, -blocks); + /* don't return an error as unreserving quotas can't fail */ + xfs_quota_reserve_blkres(ip, -(int64_t)blocks); } extern int xfs_mount_reset_sbqflags(struct xfs_mount *); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 7da0e8f961d3..063a2e00d169 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -430,13 +430,6 @@ xfs_reflink_fill_cow_hole( if (error) return error; - /* - * Allocation succeeded but the requested range was not even partially - * satisfied? Bail out! - */ - if (nimaps == 0) - return -ENOSPC; - convert: return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); @@ -499,13 +492,6 @@ xfs_reflink_fill_delalloc( error = xfs_trans_commit(tp); if (error) return error; - - /* - * Allocation succeeded but the requested range was not even - * partially satisfied? Bail out! 
- */ - if (nimaps == 0) - return -ENOSPC; } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff); return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); @@ -606,10 +592,8 @@ xfs_reflink_cancel_cow_blocks( trace_xfs_reflink_cancel_cow(ip, &del); if (isnullstartblock(del.br_startblock)) { - error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, - &icur, &got, &del); - if (error) - break; + xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got, + &del); } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); @@ -632,10 +616,7 @@ xfs_reflink_cancel_cow_blocks( xfs_bmap_del_extent_cow(ip, &icur, &got, &del); /* Remove the quota reservation */ - error = xfs_quota_unreserve_blkres(ip, - del.br_blockcount); - if (error) - break; + xfs_quota_unreserve_blkres(ip, del.br_blockcount); } else { /* Didn't do anything, push cursor back. */ xfs_iext_prev(ifp, &icur); @@ -731,12 +712,6 @@ xfs_reflink_end_cow_extent( int nmaps; int error; - /* No COW extents? That's easy! */ - if (ifp->if_bytes == 0) { - *offset_fsb = end_fsb; - return 0; - } - resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, XFS_TRANS_RESERVE, &tp); @@ -751,14 +726,6 @@ xfs_reflink_end_cow_extent( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, - XFS_IEXT_REFLINK_END_COW_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, - XFS_IEXT_REFLINK_END_COW_CNT); - if (error) - goto out_cancel; - /* * In case of racing, overlapping AIO writes no COW extents might be * left by the time I/O completes for the loser of the race. 
In that @@ -787,6 +754,11 @@ xfs_reflink_end_cow_extent( del = got; xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb); + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, + XFS_IEXT_REFLINK_END_COW_CNT); + if (error) + goto out_cancel; + /* Grab the corresponding mapping in the data fork. */ nmaps = 1; error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data, @@ -1283,9 +1255,7 @@ xfs_reflink_remap_extent( if (dmap_written) ++iext_delta; - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, iext_delta); + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, iext_delta); if (error) goto out_cancel; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index e66f9bd5de5c..5a7ddfed1bb8 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -695,11 +695,8 @@ xfs_growfs_rt_alloc( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, - XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto out_trans_cancel; @@ -709,8 +706,6 @@ xfs_growfs_rt_alloc( nmap = 1; error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, XFS_BMAPI_METADATA, 0, &map, &nmap); - if (!error && nmap < 1) - error = -ENOSPC; if (error) goto out_trans_cancel; /* @@ -957,10 +952,10 @@ xfs_growfs_rt( nargs.tp = tp; /* - * Lock out other callers by grabbing the bitmap inode lock. + * Lock out other callers by grabbing the bitmap and summary + * inode locks and joining them to the transaction. */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); - xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_rtbitmap_lock(tp, mp); /* * Update the bitmap inode's size ondisk and incore. 
We need * to update the incore size so that inode inactivation won't @@ -971,11 +966,6 @@ xfs_growfs_rt( i_size_write(VFS_I(mp->m_rbmip), mp->m_rbmip->i_disk_size); xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); /* - * Get the summary inode into the transaction. - */ - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); - xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); - /* * Update the summary inode's size. We need to update the * incore size so that inode inactivation won't punch what it * thinks are "posteof" blocks. @@ -1142,10 +1132,10 @@ xfs_rtalloc_reinit_frextents( uint64_t val = 0; int error; - xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP); error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent, &val); - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); if (error) return error; @@ -1346,6 +1336,8 @@ xfs_bmap_rtalloc( int error; align = xfs_get_extsz_hint(ap->ip); + if (!align) + align = 1; retry: error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 1, ap->eof, 0, @@ -1382,10 +1374,7 @@ retry: * Lock out modifications to both the RT bitmap and summary inodes */ if (!rtlocked) { - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); - xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); - xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_rtbitmap_lock(ap->tp, mp); rtlocked = true; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bce020374c5e..27e9f749c4c7 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -43,6 +43,8 @@ #include "xfs_iunlink_item.h" #include "xfs_dahash_test.h" #include "xfs_rtbitmap.h" +#include "xfs_exchmaps_item.h" +#include "xfs_parent.h" #include "scrub/stats.h" #include "scrub/rcbag_btree.h" @@ -1051,12 +1053,18 @@ xfs_init_percpu_counters( if (error) goto 
free_fdblocks; - error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL); + error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL); if (error) goto free_delalloc; + error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL); + if (error) + goto free_delalloc_rt; + return 0; +free_delalloc_rt: + percpu_counter_destroy(&mp->m_delalloc_rtextents); free_delalloc: percpu_counter_destroy(&mp->m_delalloc_blks); free_fdblocks: @@ -1086,6 +1094,9 @@ xfs_destroy_percpu_counters( percpu_counter_destroy(&mp->m_ifree); percpu_counter_destroy(&mp->m_fdblocks); ASSERT(xfs_is_shutdown(mp) || + percpu_counter_sum(&mp->m_delalloc_rtextents) == 0); + percpu_counter_destroy(&mp->m_delalloc_rtextents); + ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0); percpu_counter_destroy(&mp->m_delalloc_blks); percpu_counter_destroy(&mp->m_frextents); @@ -1579,17 +1590,21 @@ xfs_fs_fill_super( if (error) goto out_free_sb; - /* V4 support is undergoing deprecation. */ - if (!xfs_has_crc(mp)) { -#ifdef CONFIG_XFS_SUPPORT_V4 + /* + * V4 support is undergoing deprecation. + * + * Note: this has to use an open coded m_features check as xfs_has_crc + * always returns false for !CONFIG_XFS_SUPPORT_V4. + */ + if (!(mp->m_features & XFS_FEAT_CRC)) { + if (!IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) { + xfs_warn(mp, + "Deprecated V4 format (crc=0) not supported by kernel."); + error = -EINVAL; + goto out_free_sb; + } xfs_warn_once(mp, "Deprecated V4 format (crc=0) will not be supported after September 2030."); -#else - xfs_warn(mp, - "Deprecated V4 format (crc=0) not supported by kernel."); - error = -EINVAL; - goto out_free_sb; -#endif } /* ASCII case insensitivity is undergoing deprecation. */ @@ -1727,6 +1742,14 @@ xfs_fs_fill_super( goto out_filestream_unmount; } + if (xfs_has_exchange_range(mp)) + xfs_warn(mp, + "EXPERIMENTAL exchange-range feature enabled. 
Use at your own risk!"); + + if (xfs_has_parent(mp)) + xfs_warn(mp, + "EXPERIMENTAL parent pointer feature enabled. Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1873,11 +1896,7 @@ xfs_remount_ro( xfs_inodegc_stop(mp); /* Free the per-AG metadata reservation pool. */ - error = xfs_fs_unreserve_ag_blocks(mp); - if (error) { - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return error; - } + xfs_fs_unreserve_ag_blocks(mp); /* * Before we sync the metadata, we need to free up the reserve block @@ -2185,8 +2204,32 @@ xfs_init_caches(void) if (!xfs_iunlink_cache) goto out_destroy_attri_cache; + xfs_xmd_cache = kmem_cache_create("xfs_xmd_item", + sizeof(struct xfs_xmd_log_item), + 0, 0, NULL); + if (!xfs_xmd_cache) + goto out_destroy_iul_cache; + + xfs_xmi_cache = kmem_cache_create("xfs_xmi_item", + sizeof(struct xfs_xmi_log_item), + 0, 0, NULL); + if (!xfs_xmi_cache) + goto out_destroy_xmd_cache; + + xfs_parent_args_cache = kmem_cache_create("xfs_parent_args", + sizeof(struct xfs_parent_args), + 0, 0, NULL); + if (!xfs_parent_args_cache) + goto out_destroy_xmi_cache; + return 0; + out_destroy_xmi_cache: + kmem_cache_destroy(xfs_xmi_cache); + out_destroy_xmd_cache: + kmem_cache_destroy(xfs_xmd_cache); + out_destroy_iul_cache: + kmem_cache_destroy(xfs_iunlink_cache); out_destroy_attri_cache: kmem_cache_destroy(xfs_attri_cache); out_destroy_attrd_cache: @@ -2243,6 +2286,9 @@ xfs_destroy_caches(void) * destroy caches. 
*/ rcu_barrier(); + kmem_cache_destroy(xfs_parent_args_cache); + kmem_cache_destroy(xfs_xmd_cache); + kmem_cache_destroy(xfs_xmi_cache); kmem_cache_destroy(xfs_iunlink_cache); kmem_cache_destroy(xfs_attri_cache); kmem_cache_destroy(xfs_attrd_cache); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 3e376d24c7c1..17aee806ec2e 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -25,6 +25,8 @@ #include "xfs_error.h" #include "xfs_health.h" #include "xfs_symlink_remote.h" +#include "xfs_parent.h" +#include "xfs_defer.h" int xfs_readlink( @@ -100,6 +102,7 @@ xfs_symlink( struct xfs_dquot *pdqp = NULL; uint resblks; xfs_ino_t ino; + struct xfs_parent_args *ppargs; *ipp = NULL; @@ -130,18 +133,24 @@ xfs_symlink( /* * The symlink will fit into the inode data fork? - * There can't be any attributes so we get the whole variable part. + * If there are no parent pointers, then there wont't be any attributes. + * So we get the whole variable part, and do not need to reserve extra + * blocks. Otherwise, we need to reserve the blocks. */ - if (pathlen <= XFS_LITINO(mp)) + if (pathlen <= XFS_LITINO(mp) && !xfs_has_parent(mp)) fs_blocks = 0; else fs_blocks = xfs_symlink_blocks(mp, pathlen); - resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); + resblks = xfs_symlink_space_res(mp, link_name->len, fs_blocks); + + error = xfs_parent_start(mp, &ppargs); + if (error) + goto out_release_dquots; error = xfs_trans_alloc_icreate(mp, &M_RES(mp)->tr_symlink, udqp, gdqp, pdqp, resblks, &tp); if (error) - goto out_release_dquots; + goto out_parent; xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; @@ -161,7 +170,7 @@ xfs_symlink( if (!error) error = xfs_init_new_inode(idmap, tp, dp, ino, S_IFLNK | (mode & ~S_IFMT), 1, 0, prid, - false, &ip); + xfs_has_parent(mp), &ip); if (error) goto out_trans_cancel; @@ -172,8 +181,7 @@ xfs_symlink( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. 
*/ - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - unlock_dp_on_error = false; + xfs_trans_ijoin(tp, dp, 0); /* * Also attach the dquot(s) to it, if applicable. @@ -181,8 +189,8 @@ xfs_symlink( xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); resblks -= XFS_IALLOC_SPACE_RES(mp); - error = xfs_symlink_write_target(tp, ip, target_path, pathlen, - fs_blocks, resblks); + error = xfs_symlink_write_target(tp, ip, ip->i_ino, target_path, + pathlen, fs_blocks, resblks); if (error) goto out_trans_cancel; resblks -= fs_blocks; @@ -196,6 +204,14 @@ xfs_symlink( goto out_trans_cancel; xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + /* Add parent pointer for the new symlink. */ + if (ppargs) { + error = xfs_parent_addname(tp, ppargs, dp, link_name, ip); + if (error) + goto out_trans_cancel; + } + xfs_dir_update_hook(dp, ip, 1, link_name); /* @@ -215,6 +231,9 @@ xfs_symlink( xfs_qm_dqrele(pdqp); *ipp = ip; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_parent_finish(mp, ppargs); return 0; out_trans_cancel: @@ -226,9 +245,12 @@ out_release_inode: * transactions and deadlocks from xfs_inactive. */ if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_finish_inode_setup(ip); xfs_irele(ip); } +out_parent: + xfs_parent_finish(mp, ppargs); out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -250,19 +272,12 @@ out_release_dquots: */ STATIC int xfs_inactive_symlink_rmt( - struct xfs_inode *ip) + struct xfs_inode *ip) { - struct xfs_buf *bp; - int done; - int error; - int i; - xfs_mount_t *mp; - xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS]; - int nmaps; - int size; - xfs_trans_t *tp; - - mp = ip->i_mount; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + ASSERT(!xfs_need_iread_extents(&ip->i_df)); /* * We're freeing a symlink that has some @@ -286,44 +301,14 @@ xfs_inactive_symlink_rmt( * locked for the second transaction. 
In the error paths we need it * held so the cancel won't rele it, see below. */ - size = (int)ip->i_disk_size; ip->i_disk_size = 0; VFS_I(ip)->i_mode = (VFS_I(ip)->i_mode & ~S_IFMT) | S_IFREG; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - /* - * Find the block(s) so we can inval and unmap them. - */ - done = 0; - nmaps = ARRAY_SIZE(mval); - error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size), - mval, &nmaps, 0); - if (error) - goto error_trans_cancel; - /* - * Invalidate the block(s). No validation is done. - */ - for (i = 0; i < nmaps; i++) { - error = xfs_trans_get_buf(tp, mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), - XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0, - &bp); - if (error) - goto error_trans_cancel; - xfs_trans_binval(tp, bp); - } - /* - * Unmap the dead block(s) to the dfops. - */ - error = xfs_bunmapi(tp, ip, 0, size, 0, nmaps, &done); + + error = xfs_symlink_remote_truncate(tp, ip); if (error) goto error_trans_cancel; - ASSERT(done); - /* - * Commit the transaction. This first logs the EFI and the inode, then - * rolls and commits the transaction that frees the extents. 
- */ - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_trans_commit(tp); if (error) { ASSERT(xfs_is_shutdown(mp)); diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 1a963382e5e9..9c7fbaae2717 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -39,6 +39,9 @@ #include "xfs_buf_mem.h" #include "xfs_btree_mem.h" #include "xfs_bmap.h" +#include "xfs_exchmaps.h" +#include "xfs_exchrange.h" +#include "xfs_parent.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index aea97fc074f8..25ff6fe1eb6c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -31,6 +31,8 @@ * pos: file offset, in bytes * bytecount: number of bytes * + * dablk: directory or xattr block offset, in filesystem blocks + * * disize: ondisk file size, in bytes * isize: incore file size, in bytes * @@ -82,11 +84,18 @@ struct xfs_perag; struct xfbtree; struct xfs_btree_ops; struct xfs_bmap_intent; +struct xfs_exchmaps_intent; +struct xfs_exchmaps_req; +struct xfs_exchrange; +struct xfs_getparents; +struct xfs_parent_irec; +struct xfs_attrlist_cursor_kern; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ { XFS_ATTR_SECURE, "SECURE" }, \ - { XFS_ATTR_INCOMPLETE, "INCOMPLETE" } + { XFS_ATTR_INCOMPLETE, "INCOMPLETE" }, \ + { XFS_ATTR_PARENT, "PARENT" } DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -159,7 +168,7 @@ TRACE_EVENT(xlog_intent_recovery_failed, ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; - __assign_str(name, ops->name); + __assign_str(name); __entry->error = error; ), TP_printk("dev %d:%d optype %s error %d", @@ -1654,7 +1663,6 @@ DEFINE_EVENT(xfs_extent_busy_class, name, \ xfs_agblock_t agbno, xfs_extlen_t len), \ TP_ARGS(mp, agno, agbno, len)) DEFINE_BUSY_EVENT(xfs_extent_busy); -DEFINE_BUSY_EVENT(xfs_extent_busy_enomem); DEFINE_BUSY_EVENT(xfs_extent_busy_force); DEFINE_BUSY_EVENT(xfs_extent_busy_reuse); 
DEFINE_BUSY_EVENT(xfs_extent_busy_clear); @@ -1905,7 +1913,7 @@ TRACE_EVENT(xfs_alloc_cur_check, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->bno = bno; __entry->len = len; __entry->diff = diff; @@ -1928,6 +1936,7 @@ DECLARE_EVENT_CLASS(xfs_da_class, __field(xfs_dahash_t, hashval) __field(xfs_ino_t, inumber) __field(uint32_t, op_flags) + __field(xfs_ino_t, owner) ), TP_fast_assign( __entry->dev = VFS_I(args->dp)->i_sb->s_dev; @@ -1938,9 +1947,10 @@ DECLARE_EVENT_CLASS(xfs_da_class, __entry->hashval = args->hashval; __entry->inumber = args->inumber; __entry->op_flags = args->op_flags; + __entry->owner = args->owner; ), TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x " - "inumber 0x%llx op_flags %s", + "inumber 0x%llx op_flags %s owner 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->namelen, @@ -1948,7 +1958,8 @@ DECLARE_EVENT_CLASS(xfs_da_class, __entry->namelen, __entry->hashval, __entry->inumber, - __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), + __entry->owner) ) #define DEFINE_DIR2_EVENT(name) \ @@ -1992,7 +2003,6 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __field(int, valuelen) __field(xfs_dahash_t, hashval) __field(unsigned int, attr_filter) - __field(unsigned int, attr_flags) __field(uint32_t, op_flags) ), TP_fast_assign( @@ -2004,11 +2014,10 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __entry->valuelen = args->valuelen; __entry->hashval = args->hashval; __entry->attr_filter = args->attr_filter; - __entry->attr_flags = args->attr_flags; __entry->op_flags = args->op_flags; ), TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d " - "hashval 0x%x filter %s flags %s op_flags %s", + "hashval 0x%x filter %s op_flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->namelen, @@ -2018,9 +2027,6 @@ DECLARE_EVENT_CLASS(xfs_attr_class, 
__entry->hashval, __print_flags(__entry->attr_filter, "|", XFS_ATTR_FILTER_FLAGS), - __print_flags(__entry->attr_flags, "|", - { XATTR_CREATE, "CREATE" }, - { XATTR_REPLACE, "REPLACE" }), __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) ) @@ -2467,7 +2473,7 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->nlevels = cur->bc_nlevels; __entry->ptr = cur->bc_levels[level].ptr; @@ -2517,7 +2523,7 @@ TRACE_EVENT(xfs_btree_alloc_block, __entry->ino = 0; break; } - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->error = error; if (!error && stat) { if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { @@ -2561,7 +2567,7 @@ TRACE_EVENT(xfs_btree_free_block, __entry->ino = cur->bc_ino.ip->i_ino; else __entry->ino = 0; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->agbno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); ), @@ -2637,7 +2643,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, ), TP_fast_assign( __entry->dev = mp ? mp->m_super->s_dev : 0; - __assign_str(name, dfp->dfp_ops->name); + __assign_str(name); __entry->intent = dfp->dfp_intent; __entry->flags = dfp->dfp_flags; __entry->committed = dfp->dfp_done != NULL; @@ -2726,7 +2732,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, ), TP_fast_assign( __entry->dev = mp ? 
mp->m_super->s_dev : 0; - __assign_str(name, dfp->dfp_ops->name); + __assign_str(name); __entry->intent = dfp->dfp_intent; __entry->item = item; __entry->committed = dfp->dfp_done != NULL; @@ -3062,7 +3068,6 @@ DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent); DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical); DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed); -DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error); DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error); /* refcount tracepoint classes */ @@ -4239,7 +4244,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->agno = cur->bc_ag.pag->pag_agno; __entry->agbno = cur->bc_ag.afake->af_root; __entry->levels = cur->bc_ag.afake->af_levels; @@ -4268,7 +4273,7 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->agno = XFS_INO_TO_AGNO(cur->bc_mp, cur->bc_ino.ip->i_ino); __entry->agino = XFS_INO_TO_AGINO(cur->bc_mp, @@ -4307,7 +4312,7 @@ TRACE_EVENT(xfs_btree_bload_level_geometry, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->nlevels = cur->bc_nlevels; __entry->nr_this_level = nr_this_level; @@ -4345,7 +4350,7 @@ TRACE_EVENT(xfs_btree_bload_block, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->block_idx = block_idx; __entry->nr_blocks = nr_blocks; @@ -4568,7 +4573,7 @@ TRACE_EVENT(xfs_force_shutdown, __entry->dev = mp->m_super->s_dev; __entry->ptag = ptag; __entry->flags = flags; - __assign_str(fname, fname); + __assign_str(fname); __entry->line_num = line_num; ), TP_printk("dev %d:%d tag %s flags %s file %s line_num %d", @@ -4750,7 +4755,7 @@ DECLARE_EVENT_CLASS(xfbtree_freesp_class, 
), TP_fast_assign( __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; - __assign_str(btname, cur->bc_ops->name); + __assign_str(btname); __entry->nlevels = cur->bc_nlevels; __entry->fileoff = fileoff; ), @@ -4770,6 +4775,419 @@ DEFINE_XFBTREE_FREESP_EVENT(xfbtree_alloc_block); DEFINE_XFBTREE_FREESP_EVENT(xfbtree_free_block); #endif /* CONFIG_XFS_BTREE_IN_MEM */ +/* exchmaps tracepoints */ +#define XFS_EXCHMAPS_STRINGS \ + { XFS_EXCHMAPS_ATTR_FORK, "ATTRFORK" }, \ + { XFS_EXCHMAPS_SET_SIZES, "SETSIZES" }, \ + { XFS_EXCHMAPS_INO1_WRITTEN, "INO1_WRITTEN" }, \ + { XFS_EXCHMAPS_CLEAR_INO1_REFLINK, "CLEAR_INO1_REFLINK" }, \ + { XFS_EXCHMAPS_CLEAR_INO2_REFLINK, "CLEAR_INO2_REFLINK" }, \ + { __XFS_EXCHMAPS_INO2_SHORTFORM, "INO2_SF" } + +DEFINE_INODE_IREC_EVENT(xfs_exchmaps_mapping1_skip); +DEFINE_INODE_IREC_EVENT(xfs_exchmaps_mapping1); +DEFINE_INODE_IREC_EVENT(xfs_exchmaps_mapping2); +DEFINE_ITRUNC_EVENT(xfs_exchmaps_update_inode_size); + +#define XFS_EXCHRANGE_INODES \ + { 1, "file1" }, \ + { 2, "file2" } + +DECLARE_EVENT_CLASS(xfs_exchrange_inode_class, + TP_PROTO(struct xfs_inode *ip, int whichfile), + TP_ARGS(ip, whichfile), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, whichfile) + __field(xfs_ino_t, ino) + __field(int, format) + __field(xfs_extnum_t, nex) + __field(int, broot_size) + __field(int, fork_off) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->whichfile = whichfile; + __entry->ino = ip->i_ino; + __entry->format = ip->i_df.if_format; + __entry->nex = ip->i_df.if_nextents; + __entry->fork_off = xfs_inode_fork_boff(ip); + ), + TP_printk("dev %d:%d ino 0x%llx whichfile %s format %s num_extents %llu forkoff 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfile, XFS_EXCHRANGE_INODES), + __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), + __entry->nex, + __entry->fork_off) +) + +#define DEFINE_EXCHRANGE_INODE_EVENT(name) \ 
+DEFINE_EVENT(xfs_exchrange_inode_class, name, \ + TP_PROTO(struct xfs_inode *ip, int whichfile), \ + TP_ARGS(ip, whichfile)) + +DEFINE_EXCHRANGE_INODE_EVENT(xfs_exchrange_before); +DEFINE_EXCHRANGE_INODE_EVENT(xfs_exchrange_after); +DEFINE_INODE_ERROR_EVENT(xfs_exchrange_error); + +#define XFS_EXCHANGE_RANGE_FLAGS_STRS \ + { XFS_EXCHANGE_RANGE_TO_EOF, "TO_EOF" }, \ + { XFS_EXCHANGE_RANGE_DSYNC , "DSYNC" }, \ + { XFS_EXCHANGE_RANGE_DRY_RUN, "DRY_RUN" }, \ + { XFS_EXCHANGE_RANGE_FILE1_WRITTEN, "F1_WRITTEN" }, \ + { __XFS_EXCHANGE_RANGE_UPD_CMTIME1, "CMTIME1" }, \ + { __XFS_EXCHANGE_RANGE_UPD_CMTIME2, "CMTIME2" } + +/* file exchange-range tracepoint class */ +DECLARE_EVENT_CLASS(xfs_exchrange_class, + TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip1, + struct xfs_inode *ip2), + TP_ARGS(fxr, ip1, ip2), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ip1_ino) + __field(loff_t, ip1_isize) + __field(loff_t, ip1_disize) + __field(xfs_ino_t, ip2_ino) + __field(loff_t, ip2_isize) + __field(loff_t, ip2_disize) + + __field(loff_t, file1_offset) + __field(loff_t, file2_offset) + __field(unsigned long long, length) + __field(unsigned long long, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip1)->i_sb->s_dev; + __entry->ip1_ino = ip1->i_ino; + __entry->ip1_isize = VFS_I(ip1)->i_size; + __entry->ip1_disize = ip1->i_disk_size; + __entry->ip2_ino = ip2->i_ino; + __entry->ip2_isize = VFS_I(ip2)->i_size; + __entry->ip2_disize = ip2->i_disk_size; + + __entry->file1_offset = fxr->file1_offset; + __entry->file2_offset = fxr->file2_offset; + __entry->length = fxr->length; + __entry->flags = fxr->flags; + ), + TP_printk("dev %d:%d flags %s bytecount 0x%llx " + "ino1 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx -> " + "ino2 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_flags_u64(__entry->flags, "|", XFS_EXCHANGE_RANGE_FLAGS_STRS), + __entry->length, + __entry->ip1_ino, + __entry->ip1_isize, + 
__entry->ip1_disize, + __entry->file1_offset, + __entry->ip2_ino, + __entry->ip2_isize, + __entry->ip2_disize, + __entry->file2_offset) +) + +#define DEFINE_EXCHRANGE_EVENT(name) \ +DEFINE_EVENT(xfs_exchrange_class, name, \ + TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip1, \ + struct xfs_inode *ip2), \ + TP_ARGS(fxr, ip1, ip2)) +DEFINE_EXCHRANGE_EVENT(xfs_exchrange_prep); +DEFINE_EXCHRANGE_EVENT(xfs_exchrange_flush); +DEFINE_EXCHRANGE_EVENT(xfs_exchrange_mappings); + +TRACE_EVENT(xfs_exchmaps_overhead, + TP_PROTO(struct xfs_mount *mp, unsigned long long bmbt_blocks, + unsigned long long rmapbt_blocks), + TP_ARGS(mp, bmbt_blocks, rmapbt_blocks), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long long, bmbt_blocks) + __field(unsigned long long, rmapbt_blocks) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->bmbt_blocks = bmbt_blocks; + __entry->rmapbt_blocks = rmapbt_blocks; + ), + TP_printk("dev %d:%d bmbt_blocks 0x%llx rmapbt_blocks 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->bmbt_blocks, + __entry->rmapbt_blocks) +); + +DECLARE_EVENT_CLASS(xfs_exchmaps_estimate_class, + TP_PROTO(const struct xfs_exchmaps_req *req), + TP_ARGS(req), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino1) + __field(xfs_ino_t, ino2) + __field(xfs_fileoff_t, startoff1) + __field(xfs_fileoff_t, startoff2) + __field(xfs_filblks_t, blockcount) + __field(uint64_t, flags) + __field(xfs_filblks_t, ip1_bcount) + __field(xfs_filblks_t, ip2_bcount) + __field(xfs_filblks_t, ip1_rtbcount) + __field(xfs_filblks_t, ip2_rtbcount) + __field(unsigned long long, resblks) + __field(unsigned long long, nr_exchanges) + ), + TP_fast_assign( + __entry->dev = req->ip1->i_mount->m_super->s_dev; + __entry->ino1 = req->ip1->i_ino; + __entry->ino2 = req->ip2->i_ino; + __entry->startoff1 = req->startoff1; + __entry->startoff2 = req->startoff2; + __entry->blockcount = req->blockcount; + __entry->flags = req->flags; + 
__entry->ip1_bcount = req->ip1_bcount; + __entry->ip2_bcount = req->ip2_bcount; + __entry->ip1_rtbcount = req->ip1_rtbcount; + __entry->ip2_rtbcount = req->ip2_rtbcount; + __entry->resblks = req->resblks; + __entry->nr_exchanges = req->nr_exchanges; + ), + TP_printk("dev %d:%d ino1 0x%llx fileoff1 0x%llx ino2 0x%llx fileoff2 0x%llx fsbcount 0x%llx flags (%s) bcount1 0x%llx rtbcount1 0x%llx bcount2 0x%llx rtbcount2 0x%llx resblks 0x%llx nr_exchanges %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino1, __entry->startoff1, + __entry->ino2, __entry->startoff2, + __entry->blockcount, + __print_flags_u64(__entry->flags, "|", XFS_EXCHMAPS_STRINGS), + __entry->ip1_bcount, + __entry->ip1_rtbcount, + __entry->ip2_bcount, + __entry->ip2_rtbcount, + __entry->resblks, + __entry->nr_exchanges) +); + +#define DEFINE_EXCHMAPS_ESTIMATE_EVENT(name) \ +DEFINE_EVENT(xfs_exchmaps_estimate_class, name, \ + TP_PROTO(const struct xfs_exchmaps_req *req), \ + TP_ARGS(req)) +DEFINE_EXCHMAPS_ESTIMATE_EVENT(xfs_exchmaps_initial_estimate); +DEFINE_EXCHMAPS_ESTIMATE_EVENT(xfs_exchmaps_final_estimate); + +DECLARE_EVENT_CLASS(xfs_exchmaps_intent_class, + TP_PROTO(struct xfs_mount *mp, const struct xfs_exchmaps_intent *xmi), + TP_ARGS(mp, xmi), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino1) + __field(xfs_ino_t, ino2) + __field(uint64_t, flags) + __field(xfs_fileoff_t, startoff1) + __field(xfs_fileoff_t, startoff2) + __field(xfs_filblks_t, blockcount) + __field(xfs_fsize_t, isize1) + __field(xfs_fsize_t, isize2) + __field(xfs_fsize_t, new_isize1) + __field(xfs_fsize_t, new_isize2) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino1 = xmi->xmi_ip1->i_ino; + __entry->ino2 = xmi->xmi_ip2->i_ino; + __entry->flags = xmi->xmi_flags; + __entry->startoff1 = xmi->xmi_startoff1; + __entry->startoff2 = xmi->xmi_startoff2; + __entry->blockcount = xmi->xmi_blockcount; + __entry->isize1 = xmi->xmi_ip1->i_disk_size; + __entry->isize2 = 
xmi->xmi_ip2->i_disk_size; + __entry->new_isize1 = xmi->xmi_isize1; + __entry->new_isize2 = xmi->xmi_isize2; + ), + TP_printk("dev %d:%d ino1 0x%llx fileoff1 0x%llx ino2 0x%llx fileoff2 0x%llx fsbcount 0x%llx flags (%s) isize1 0x%llx newisize1 0x%llx isize2 0x%llx newisize2 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino1, __entry->startoff1, + __entry->ino2, __entry->startoff2, + __entry->blockcount, + __print_flags_u64(__entry->flags, "|", XFS_EXCHMAPS_STRINGS), + __entry->isize1, __entry->new_isize1, + __entry->isize2, __entry->new_isize2) +); + +#define DEFINE_EXCHMAPS_INTENT_EVENT(name) \ +DEFINE_EVENT(xfs_exchmaps_intent_class, name, \ + TP_PROTO(struct xfs_mount *mp, const struct xfs_exchmaps_intent *xmi), \ + TP_ARGS(mp, xmi)) +DEFINE_EXCHMAPS_INTENT_EVENT(xfs_exchmaps_defer); +DEFINE_EXCHMAPS_INTENT_EVENT(xfs_exchmaps_recover); + +TRACE_EVENT(xfs_exchmaps_delta_nextents_step, + TP_PROTO(struct xfs_mount *mp, + const struct xfs_bmbt_irec *left, + const struct xfs_bmbt_irec *curr, + const struct xfs_bmbt_irec *new, + const struct xfs_bmbt_irec *right, + int delta, unsigned int state), + TP_ARGS(mp, left, curr, new, right, delta, state), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_fileoff_t, loff) + __field(xfs_fsblock_t, lstart) + __field(xfs_filblks_t, lcount) + __field(xfs_fileoff_t, coff) + __field(xfs_fsblock_t, cstart) + __field(xfs_filblks_t, ccount) + __field(xfs_fileoff_t, noff) + __field(xfs_fsblock_t, nstart) + __field(xfs_filblks_t, ncount) + __field(xfs_fileoff_t, roff) + __field(xfs_fsblock_t, rstart) + __field(xfs_filblks_t, rcount) + __field(int, delta) + __field(unsigned int, state) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->loff = left->br_startoff; + __entry->lstart = left->br_startblock; + __entry->lcount = left->br_blockcount; + __entry->coff = curr->br_startoff; + __entry->cstart = curr->br_startblock; + __entry->ccount = curr->br_blockcount; + __entry->noff = new->br_startoff; 
+ __entry->nstart = new->br_startblock; + __entry->ncount = new->br_blockcount; + __entry->roff = right->br_startoff; + __entry->rstart = right->br_startblock; + __entry->rcount = right->br_blockcount; + __entry->delta = delta; + __entry->state = state; + ), + TP_printk("dev %d:%d left 0x%llx:0x%llx:0x%llx; curr 0x%llx:0x%llx:0x%llx <- new 0x%llx:0x%llx:0x%llx; right 0x%llx:0x%llx:0x%llx delta %d state 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->loff, __entry->lstart, __entry->lcount, + __entry->coff, __entry->cstart, __entry->ccount, + __entry->noff, __entry->nstart, __entry->ncount, + __entry->roff, __entry->rstart, __entry->rcount, + __entry->delta, __entry->state) +); + +TRACE_EVENT(xfs_exchmaps_delta_nextents, + TP_PROTO(const struct xfs_exchmaps_req *req, int64_t d_nexts1, + int64_t d_nexts2), + TP_ARGS(req, d_nexts1, d_nexts2), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino1) + __field(xfs_ino_t, ino2) + __field(xfs_extnum_t, nexts1) + __field(xfs_extnum_t, nexts2) + __field(int64_t, d_nexts1) + __field(int64_t, d_nexts2) + ), + TP_fast_assign( + int whichfork = xfs_exchmaps_reqfork(req); + + __entry->dev = req->ip1->i_mount->m_super->s_dev; + __entry->ino1 = req->ip1->i_ino; + __entry->ino2 = req->ip2->i_ino; + __entry->nexts1 = xfs_ifork_ptr(req->ip1, whichfork)->if_nextents; + __entry->nexts2 = xfs_ifork_ptr(req->ip2, whichfork)->if_nextents; + __entry->d_nexts1 = d_nexts1; + __entry->d_nexts2 = d_nexts2; + ), + TP_printk("dev %d:%d ino1 0x%llx nexts %llu ino2 0x%llx nexts %llu delta1 %lld delta2 %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino1, __entry->nexts1, + __entry->ino2, __entry->nexts2, + __entry->d_nexts1, __entry->d_nexts2) +); + +DECLARE_EVENT_CLASS(xfs_getparents_rec_class, + TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi, + const struct xfs_attr_list_context *context, + const struct xfs_getparents_rec *pptr), + TP_ARGS(ip, ppi, context, pptr), + TP_STRUCT__entry( + 
__field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, firstu) + __field(unsigned short, reclen) + __field(unsigned int, bufsize) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __string(name, pptr->gpr_name) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->firstu = context->firstu; + __entry->reclen = pptr->gpr_reclen; + __entry->bufsize = ppi->gp_bufsize; + __entry->parent_ino = pptr->gpr_parent.ha_fid.fid_ino; + __entry->parent_gen = pptr->gpr_parent.ha_fid.fid_gen; + __assign_str(name); + ), + TP_printk("dev %d:%d ino 0x%llx firstu %u reclen %u bufsize %u parent_ino 0x%llx parent_gen 0x%x name '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->firstu, + __entry->reclen, + __entry->bufsize, + __entry->parent_ino, + __entry->parent_gen, + __get_str(name)) +) +#define DEFINE_XFS_GETPARENTS_REC_EVENT(name) \ +DEFINE_EVENT(xfs_getparents_rec_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi, \ + const struct xfs_attr_list_context *context, \ + const struct xfs_getparents_rec *pptr), \ + TP_ARGS(ip, ppi, context, pptr)) +DEFINE_XFS_GETPARENTS_REC_EVENT(xfs_getparents_put_listent); +DEFINE_XFS_GETPARENTS_REC_EVENT(xfs_getparents_expand_lastrec); + +DECLARE_EVENT_CLASS(xfs_getparents_class, + TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi, + const struct xfs_attrlist_cursor_kern *cur), + TP_ARGS(ip, ppi, cur), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned short, iflags) + __field(unsigned short, oflags) + __field(unsigned int, bufsize) + __field(unsigned int, hashval) + __field(unsigned int, blkno) + __field(unsigned int, offset) + __field(int, initted) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->iflags = ppi->gp_iflags; + __entry->oflags = ppi->gp_oflags; + __entry->bufsize = ppi->gp_bufsize; + 
__entry->hashval = cur->hashval; + __entry->blkno = cur->blkno; + __entry->offset = cur->offset; + __entry->initted = cur->initted; + ), + TP_printk("dev %d:%d ino 0x%llx iflags 0x%x oflags 0x%x bufsize %u cur_init? %d hashval 0x%x blkno %u offset %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->iflags, + __entry->oflags, + __entry->bufsize, + __entry->initted, + __entry->hashval, + __entry->blkno, + __entry->offset) +) +#define DEFINE_XFS_GETPARENTS_EVENT(name) \ +DEFINE_EVENT(xfs_getparents_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi, \ + const struct xfs_attrlist_cursor_kern *cur), \ + TP_ARGS(ip, ppi, cur)) +DEFINE_XFS_GETPARENTS_EVENT(xfs_getparents_begin); +DEFINE_XFS_GETPARENTS_EVENT(xfs_getparents_end); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 7350640059cc..828da4ac4316 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -163,7 +163,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (blocks > 0) { - error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd); + error = xfs_dec_fdblocks(mp, blocks, rsvd); if (error != 0) return -ENOSPC; tp->t_blk_res += blocks; @@ -210,7 +210,7 @@ xfs_trans_reserve( * fail if the count would go below zero. 
*/ if (rtextents > 0) { - error = xfs_mod_frextents(mp, -((int64_t)rtextents)); + error = xfs_dec_frextents(mp, rtextents); if (error) { error = -ENOSPC; goto undo_log; @@ -234,7 +234,7 @@ undo_log: undo_blocks: if (blocks > 0) { - xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd); + xfs_add_fdblocks(mp, blocks); tp->t_blk_res = 0; } return error; @@ -593,38 +593,44 @@ xfs_trans_unreserve_and_mod_sb( struct xfs_trans *tp) { struct xfs_mount *mp = tp->t_mountp; - bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; - int64_t blkdelta = 0; - int64_t rtxdelta = 0; + int64_t blkdelta = tp->t_blk_res; + int64_t rtxdelta = tp->t_rtx_res; int64_t idelta = 0; int64_t ifreedelta = 0; - int error; - /* calculate deltas */ - if (tp->t_blk_res > 0) - blkdelta = tp->t_blk_res; - if ((tp->t_fdblocks_delta != 0) && - (xfs_has_lazysbcount(mp) || - (tp->t_flags & XFS_TRANS_SB_DIRTY))) + /* + * Calculate the deltas. + * + * t_fdblocks_delta and t_frextents_delta can be positive or negative: + * + * - positive values indicate blocks freed in the transaction. + * - negative values indicate blocks allocated in the transaction + * + * Negative values can only happen if the transaction has a block + * reservation that covers the allocated block. The end result is + * that the calculated delta values must always be positive and we + * can only put back previous allocated or reserved blocks here. 
+ */ + ASSERT(tp->t_blk_res || tp->t_fdblocks_delta >= 0); + if (xfs_has_lazysbcount(mp) || (tp->t_flags & XFS_TRANS_SB_DIRTY)) { blkdelta += tp->t_fdblocks_delta; + ASSERT(blkdelta >= 0); + } - if (tp->t_rtx_res > 0) - rtxdelta = tp->t_rtx_res; - if ((tp->t_frextents_delta != 0) && - (tp->t_flags & XFS_TRANS_SB_DIRTY)) + ASSERT(tp->t_rtx_res || tp->t_frextents_delta >= 0); + if (tp->t_flags & XFS_TRANS_SB_DIRTY) { rtxdelta += tp->t_frextents_delta; + ASSERT(rtxdelta >= 0); + } - if (xfs_has_lazysbcount(mp) || - (tp->t_flags & XFS_TRANS_SB_DIRTY)) { + if (xfs_has_lazysbcount(mp) || (tp->t_flags & XFS_TRANS_SB_DIRTY)) { idelta = tp->t_icount_delta; ifreedelta = tp->t_ifree_delta; } /* apply the per-cpu counters */ - if (blkdelta) { - error = xfs_mod_fdblocks(mp, blkdelta, rsvd); - ASSERT(!error); - } + if (blkdelta) + xfs_add_fdblocks(mp, blkdelta); if (idelta) percpu_counter_add_batch(&mp->m_icount, idelta, @@ -633,10 +639,8 @@ xfs_trans_unreserve_and_mod_sb( if (ifreedelta) percpu_counter_add(&mp->m_ifree, ifreedelta); - if (rtxdelta) { - error = xfs_mod_frextents(mp, rtxdelta); - ASSERT(!error); - } + if (rtxdelta) + xfs_add_frextents(mp, rtxdelta); if (!(tp->t_flags & XFS_TRANS_SB_DIRTY)) return; @@ -672,7 +676,6 @@ xfs_trans_unreserve_and_mod_sb( */ ASSERT(mp->m_sb.sb_imax_pct >= 0); ASSERT(mp->m_sb.sb_rextslog >= 0); - return; } /* Add the given log item to the transaction's list of log items. */ @@ -1291,9 +1294,9 @@ xfs_trans_reserve_more_inode( return 0; /* Quota failed, give back the new reservation. */ - xfs_mod_fdblocks(mp, dblocks, tp->t_flags & XFS_TRANS_RESERVE); + xfs_add_fdblocks(mp, dblocks); tp->t_blk_res -= dblocks; - xfs_mod_frextents(mp, rtx); + xfs_add_frextents(mp, rtx); tp->t_rtx_res -= rtx; return error; } @@ -1430,6 +1433,8 @@ out_cancel: * The caller must ensure that the on-disk dquots attached to this inode have * already been allocated and initialized. The ILOCKs will be dropped when the * transaction is committed or cancelled. 
+ * + * Caller is responsible for unlocking the inodes manually upon return */ int xfs_trans_alloc_dir( @@ -1460,8 +1465,8 @@ retry: xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, 0); + xfs_trans_ijoin(tp, ip, 0); error = xfs_qm_dqattach_locked(dp, false); if (error) { @@ -1484,6 +1489,9 @@ retry: if (error == -EDQUOT || error == -ENOSPC) { if (!retried) { xfs_trans_cancel(tp); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + if (dp != ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_blockgc_free_quota(dp, 0); retried = true; goto retry; diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 577b535a595c..b368e13424c4 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -379,24 +379,29 @@ xfs_trans_mod_dquot( /* * Given an array of dqtrx structures, lock all the dquots associated and join - * them to the transaction, provided they have been modified. We know that the - * highest number of dquots of one type - usr, grp and prj - involved in a - * transaction is 3 so we don't need to make this very generic. + * them to the transaction, provided they have been modified. 
*/ STATIC void xfs_trans_dqlockedjoin( struct xfs_trans *tp, struct xfs_dqtrx *q) { + unsigned int i; ASSERT(q[0].qt_dquot != NULL); if (q[1].qt_dquot == NULL) { xfs_dqlock(q[0].qt_dquot); xfs_trans_dqjoin(tp, q[0].qt_dquot); - } else { - ASSERT(XFS_QM_TRANS_MAXDQS == 2); + } else if (q[2].qt_dquot == NULL) { xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot); xfs_trans_dqjoin(tp, q[0].qt_dquot); xfs_trans_dqjoin(tp, q[1].qt_dquot); + } else { + xfs_dqlockn(q); + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + if (q[i].qt_dquot == NULL) + break; + xfs_trans_dqjoin(tp, q[i].qt_dquot); + } } } diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 364104e1b38a..ab3d22f662f2 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -17,15 +17,13 @@ #include "xfs_acl.h" #include "xfs_log.h" #include "xfs_xattr.h" +#include "xfs_quota.h" #include <linux/posix_acl_xattr.h> /* * Get permission to use log-assisted atomic exchange of file extents. - * - * Callers must not be running any transactions or hold any inode locks, and - * they must release the permission by calling xlog_drop_incompat_feat - * when they're done. + * Callers must not be running any transactions or hold any ILOCKs. */ static inline int xfs_attr_grab_log_assist( @@ -33,17 +31,8 @@ xfs_attr_grab_log_assist( { int error = 0; - /* - * Protect ourselves from an idle log clearing the logged xattrs log - * incompat feature bit. - */ - xlog_use_incompat_feat(mp->m_log); - - /* - * If log-assisted xattrs are already enabled, the caller can use the - * log assisted swap functions with the log-incompat reference we got. - */ - if (xfs_sb_version_haslogxattrs(&mp->m_sb)) + /* xattr update log intent items are already enabled */ + if (xfs_is_using_logged_xattrs(mp)) return 0; /* @@ -52,31 +41,20 @@ xfs_attr_grab_log_assist( * a V5 filesystem for the superblock field, but we'll require rmap * or reflink to avoid having to deal with really old kernels. 
*/ - if (!xfs_has_reflink(mp) && !xfs_has_rmapbt(mp)) { - error = -EOPNOTSUPP; - goto drop_incompat; - } + if (!xfs_has_reflink(mp) && !xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; /* Enable log-assisted xattrs. */ error = xfs_add_incompat_log_feature(mp, XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); if (error) - goto drop_incompat; + return error; + xfs_set_using_logged_xattrs(mp); xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP, "EXPERIMENTAL logged extended attributes feature in use. Use at your own risk!"); return 0; -drop_incompat: - xlog_drop_incompat_feat(mp->m_log); - return error; -} - -static inline void -xfs_attr_rele_log_assist( - struct xfs_mount *mp) -{ - xlog_drop_incompat_feat(mp->m_log); } static inline bool @@ -93,17 +71,31 @@ xfs_attr_want_log_assist( /* * Set or remove an xattr, having grabbed the appropriate logging resources - * prior to calling libxfs. + * prior to calling libxfs. Callers of this function are only required to + * initialize the inode, attr_filter, name, namelen, value, and valuelen fields + * of @args. */ int xfs_attr_change( - struct xfs_da_args *args) + struct xfs_da_args *args, + enum xfs_attr_update op) { struct xfs_mount *mp = args->dp->i_mount; - bool use_logging = false; int error; - ASSERT(!(args->op_flags & XFS_DA_OP_LOGGED)); + if (xfs_is_shutdown(mp)) + return -EIO; + + error = xfs_qm_dqattach(args->dp); + if (error) + return error; + + /* + * We have no control over the attribute names that userspace passes us + * to remove, so we have to allow the name lookup prior to attribute + * removal to fail as well. 
+ */ + args->op_flags = XFS_DA_OP_OKNOENT; if (xfs_attr_want_log_assist(mp)) { error = xfs_attr_grab_log_assist(mp); @@ -111,14 +103,14 @@ xfs_attr_change( return error; args->op_flags |= XFS_DA_OP_LOGGED; - use_logging = true; } - error = xfs_attr_set(args); + args->owner = args->dp->i_ino; + args->geo = mp->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + xfs_attr_sethash(args); - if (use_logging) - xfs_attr_rele_log_assist(mp); - return error; + return xfs_attr_set(args, op, args->attr_filter & XFS_ATTR_ROOT); } @@ -145,6 +137,20 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, return args.valuelen; } +static inline enum xfs_attr_update +xfs_xattr_flags_to_op( + int flags, + const void *value) +{ + if (!value) + return XFS_ATTRUPDATE_REMOVE; + if (flags & XATTR_CREATE) + return XFS_ATTRUPDATE_CREATE; + if (flags & XATTR_REPLACE) + return XFS_ATTRUPDATE_REPLACE; + return XFS_ATTRUPDATE_UPSERT; +} + static int xfs_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, @@ -154,7 +160,6 @@ xfs_xattr_set(const struct xattr_handler *handler, struct xfs_da_args args = { .dp = XFS_I(inode), .attr_filter = handler->flags, - .attr_flags = flags, .name = name, .namelen = strlen(name), .value = (void *)value, @@ -162,7 +167,7 @@ xfs_xattr_set(const struct xattr_handler *handler, }; int error; - error = xfs_attr_change(&args); + error = xfs_attr_change(&args, xfs_xattr_flags_to_op(flags, value)); if (!error && (handler->flags & XFS_ATTR_ROOT)) xfs_forget_acl(inode, name); return error; @@ -237,6 +242,7 @@ xfs_xattr_put_listent( int flags, unsigned char *name, int namelen, + void *value, int valuelen) { char *prefix; @@ -244,6 +250,10 @@ xfs_xattr_put_listent( ASSERT(context->count >= 0); + /* Don't expose private xattr namespaces. 
*/ + if (flags & XFS_ATTR_PRIVATE_NSP_MASK) + return; + if (flags & XFS_ATTR_ROOT) { #ifdef CONFIG_XFS_POSIX_ACL if (namelen == SGI_ACL_FILE_SIZE && diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h index cec766cad26c..c3eb858fb59e 100644 --- a/fs/xfs/xfs_xattr.h +++ b/fs/xfs/xfs_xattr.h @@ -6,7 +6,8 @@ #ifndef __XFS_XATTR_H__ #define __XFS_XATTR_H__ -int xfs_attr_change(struct xfs_da_args *args); +enum xfs_attr_update; +int xfs_attr_change(struct xfs_da_args *args, enum xfs_attr_update op); extern const struct xattr_handler * const xfs_xattr_handlers[]; |