summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fs/btrfs/ctree.h14
-rw-r--r--fs/btrfs/disk-io.c1
-rw-r--r--fs/btrfs/qgroup.c150
-rw-r--r--fs/btrfs/qgroup.h92
-rw-r--r--fs/btrfs/relocation.c7
-rw-r--r--fs/btrfs/transaction.c1
6 files changed, 265 insertions, 0 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index dd0ccc6403b0..007b0e81992a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1209,6 +1209,17 @@ enum {
};
/*
+ * Record swapped tree blocks of a subvolume tree for delayed subtree trace
+ * code. For detail check comment in fs/btrfs/qgroup.c.
+ */
+struct btrfs_qgroup_swapped_blocks {
+ spinlock_t lock;
+ /* RM_EMPTY_ROOT() of above blocks[] */
+ bool swapped;
+ struct rb_root blocks[BTRFS_MAX_LEVEL];
+};
+
+/*
* in ram representation of the tree. extent_root is used for all allocations
* and for the extent tree extent_root root.
*/
@@ -1343,6 +1354,9 @@ struct btrfs_root {
/* Number of active swapfiles */
atomic_t nr_swapfiles;
+ /* Record pairs of swapped blocks for qgroup */
+ struct btrfs_qgroup_swapped_blocks swapped_blocks;
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
u64 alloc_bytenr;
#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9e24fdb57453..3d233608fa0f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1220,6 +1220,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->anon_dev = 0;
spin_lock_init(&root->root_item_lock);
+ btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
}
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 7a6948989655..7166d202b26a 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3818,3 +3818,153 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
}
extent_changeset_release(&changeset);
}
+
+void btrfs_qgroup_init_swapped_blocks(
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks)
+{
+ int i;
+
+ spin_lock_init(&swapped_blocks->lock);
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ swapped_blocks->blocks[i] = RB_ROOT;
+ swapped_blocks->swapped = false;
+}
+
+/*
+ * Delete all swapped blocks record of @root.
+ * Every record here means we skipped a full subtree scan for qgroup.
+ *
+ * Gets called when committing one transaction.
+ */
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
+{
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks;
+ int i;
+
+ swapped_blocks = &root->swapped_blocks;
+
+ spin_lock(&swapped_blocks->lock);
+ if (!swapped_blocks->swapped)
+ goto out;
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ struct rb_root *cur_root = &swapped_blocks->blocks[i];
+ struct btrfs_qgroup_swapped_block *entry;
+ struct btrfs_qgroup_swapped_block *next;
+
+ rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
+ node)
+ kfree(entry);
+ swapped_blocks->blocks[i] = RB_ROOT;
+ }
+ swapped_blocks->swapped = false;
+out:
+ spin_unlock(&swapped_blocks->lock);
+}
+
+/*
+ * Add subtree roots record into @subvol_root.
+ *
+ * @subvol_root: tree root of the subvolume tree get swapped
+ * @bg: block group under balance
+ * @subvol_parent/slot: pointer to the subtree root in subvolume tree
+ * @reloc_parent/slot: pointer to the subtree root in reloc tree
+ * BOTH POINTERS ARE BEFORE TREE SWAP
+ * @last_snapshot: last snapshot generation of the subvolume tree
+ */
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *subvol_root,
+ struct btrfs_block_group_cache *bg,
+ struct extent_buffer *subvol_parent, int subvol_slot,
+ struct extent_buffer *reloc_parent, int reloc_slot,
+ u64 last_snapshot)
+{
+ struct btrfs_fs_info *fs_info = subvol_root->fs_info;
+ struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
+ struct btrfs_qgroup_swapped_block *block;
+ struct rb_node **cur;
+ struct rb_node *parent = NULL;
+ int level = btrfs_header_level(subvol_parent) - 1;
+ int ret = 0;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
+ btrfs_err_rl(fs_info,
+ "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
+ __func__,
+ btrfs_node_ptr_generation(subvol_parent, subvol_slot),
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot));
+ return -EUCLEAN;
+ }
+
+ block = kmalloc(sizeof(*block), GFP_NOFS);
+ if (!block) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * @reloc_parent/slot is still before swap, while @block is going to
+ * record the bytenr after swap, so we do the swap here.
+ */
+ block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
+ block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
+ reloc_slot);
+ block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
+ block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
+ subvol_slot);
+ block->last_snapshot = last_snapshot;
+ block->level = level;
+ if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
+ block->trace_leaf = true;
+ else
+ block->trace_leaf = false;
+ btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
+
+ /* Insert @block into @blocks */
+ spin_lock(&blocks->lock);
+ cur = &blocks->blocks[level].rb_node;
+ while (*cur) {
+ struct btrfs_qgroup_swapped_block *entry;
+
+ parent = *cur;
+ entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
+ node);
+
+ if (entry->subvol_bytenr < block->subvol_bytenr) {
+ cur = &(*cur)->rb_left;
+ } else if (entry->subvol_bytenr > block->subvol_bytenr) {
+ cur = &(*cur)->rb_right;
+ } else {
+ if (entry->subvol_generation !=
+ block->subvol_generation ||
+ entry->reloc_bytenr != block->reloc_bytenr ||
+ entry->reloc_generation !=
+ block->reloc_generation) {
+ /*
+ * Duplicated but mismatch entry found.
+ * Shouldn't happen.
+ *
+ * Marking qgroup inconsistent should be enough
+ * for end users.
+ */
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ ret = -EEXIST;
+ }
+ kfree(block);
+ goto out_unlock;
+ }
+ }
+ rb_link_node(&block->node, parent, cur);
+ rb_insert_color(&block->node, &blocks->blocks[level]);
+ blocks->swapped = true;
+out_unlock:
+ spin_unlock(&blocks->lock);
+out:
+ if (ret < 0)
+ fs_info->qgroup_flags |=
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ return ret;
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 20c6bd5fa701..8dc17020e5be 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_QGROUP_H
#define BTRFS_QGROUP_H
+#include <linux/spinlock.h>
+#include <linux/rbtree.h>
#include "ulist.h"
#include "delayed-ref.h"
@@ -38,6 +40,66 @@
*/
/*
+ * Special performance optimization for balance.
+ *
+ * For balance, we need to swap subtree of subvolume and reloc trees.
+ * In theory, we need to trace all subtree blocks of both subvolume and reloc
+ * trees, since their owner has changed during such swap.
+ *
+ * However since balance has ensured that both subtrees are containing the
+ * same contents and have the same tree structures, such swap won't cause
+ * qgroup number change.
+ *
+ * But there is a race window between subtree swap and transaction commit,
+ * during that window, if we increase/decrease tree level or merge/split tree
+ * blocks, we still need to trace the original subtrees.
+ *
+ * So for balance, we use a delayed subtree tracing, whose workflow is:
+ *
+ * 1) Record the subtree root block get swapped.
+ *
+ * During subtree swap:
+ * O = Old tree blocks
+ * N = New tree blocks
+ * reloc tree subvolume tree X
+ * Root Root
+ * / \ / \
+ * NA OB OA OB
+ * / | | \ / | | \
+ * NC ND OE OF OC OD OE OF
+ *
+ * In this case, NA and OA are going to be swapped, record (NA, OA) into
+ * subvolume tree X.
+ *
+ * 2) After subtree swap.
+ * reloc tree subvolume tree X
+ * Root Root
+ * / \ / \
+ * OA OB NA OB
+ * / | | \ / | | \
+ * OC OD OE OF NC ND OE OF
+ *
+ * 3a) COW happens for OB
+ * If we are going to COW tree block OB, we check OB's bytenr against
+ * tree X's swapped_blocks structure.
+ * If it doesn't fit any, nothing will happen.
+ *
+ * 3b) COW happens for NA
+ * Check NA's bytenr against tree X's swapped_blocks, and get a hit.
+ * Then we do subtree scan on both subtrees OA and NA.
+ * Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
+ *
+ * Then no matter what we do to subvolume tree X, qgroup numbers will
+ * still be correct.
+ * Then NA's record gets removed from X's swapped_blocks.
+ *
+ * 4) Transaction commit
+ * Any record in X's swapped_blocks gets removed, since there is no
+ * modification to the swapped subtrees, no need to trigger heavy qgroup
+ * subtree rescan for them.
+ */
+
+/*
* Record a dirty extent, and info qgroup to update quota on it
* TODO: Use kmem cache to alloc it.
*/
@@ -48,6 +110,24 @@ struct btrfs_qgroup_extent_record {
struct ulist *old_roots;
};
+struct btrfs_qgroup_swapped_block {
+ struct rb_node node;
+
+ int level;
+ bool trace_leaf;
+
+ /* bytenr/generation of the tree block in subvolume tree after swap */
+ u64 subvol_bytenr;
+ u64 subvol_generation;
+
+ /* bytenr/generation of the tree block in reloc tree after swap */
+ u64 reloc_bytenr;
+ u64 reloc_generation;
+
+ u64 last_snapshot;
+ struct btrfs_key first_key;
+};
+
/*
* Qgroup reservation types:
*
@@ -325,4 +405,16 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
void btrfs_qgroup_check_reserved_leak(struct inode *inode);
+/* btrfs_qgroup_swapped_blocks related functions */
+void btrfs_qgroup_init_swapped_blocks(
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks);
+
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *subvol_root,
+ struct btrfs_block_group_cache *bg,
+ struct extent_buffer *subvol_parent, int subvol_slot,
+ struct extent_buffer *reloc_parent, int reloc_slot,
+ u64 last_snapshot);
+
#endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b915f3e157bd..0c528918c844 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1898,6 +1898,13 @@ again:
if (ret < 0)
break;
+ btrfs_node_key_to_cpu(parent, &first_key, slot);
+ ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
+ rc->block_group, parent, slot,
+ path->nodes[level], path->slots[level],
+ last_snapshot);
+ if (ret < 0)
+ break;
/*
* swap blocks in fs tree and reloc tree.
*/
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index fdffe5d61739..0cc6d8b58191 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -122,6 +122,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans)
if (is_fstree(root->root_key.objectid))
btrfs_unpin_free_ino(root);
clear_btree_io_tree(&root->dirty_log_pages);
+ btrfs_qgroup_clean_swapped_blocks(root);
}
/* We can free old roots now. */