Diffstat (limited to 'fs/btrfs/send.c')
-rw-r--r--	fs/btrfs/send.c	690
1 file changed, 361 insertions(+), 329 deletions(-)
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e65e6b6600a7..e5c963bb873d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -32,6 +32,7 @@
#include "file-item.h"
#include "ioctl.h"
#include "verity.h"
+#include "lru_cache.h"
/*
* Maximum number of references an extent can have in order for us to attempt to
@@ -80,23 +81,23 @@ struct clone_root {
bool found_ref;
};
-#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
-#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
+#define SEND_MAX_NAME_CACHE_SIZE 256
/*
- * Limit the root_ids array of struct backref_cache_entry to 12 elements.
- * This makes the size of a cache entry to be exactly 128 bytes on x86_64.
+ * Limit the root_ids array of struct backref_cache_entry to 17 elements.
+ * This makes the size of a cache entry exactly 192 bytes on x86_64, which
+ * can be satisfied from the kmalloc-192 slab, without wasting any space.
* The most common case is to have a single root for cloning, which corresponds
- * to the send root. Having the user specify more than 11 clone roots is not
+ * to the send root. Having the user specify more than 16 clone roots is not
* common, and in such rare cases we simply don't use caching if the number of
- * cloning roots that lead down to a leaf is more than 12.
+ * cloning roots that lead down to a leaf is more than 17.
*/
-#define SEND_MAX_BACKREF_CACHE_ROOTS 12
+#define SEND_MAX_BACKREF_CACHE_ROOTS 17
/*
* Max number of entries in the cache.
- * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding
- * maple tree's internal nodes, is 16K.
+ * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding
+ * maple tree's internal nodes, is 24K.
*/
#define SEND_MAX_BACKREF_CACHE_SIZE 128
@@ -107,15 +108,31 @@ struct clone_root {
* x86_64).
*/
struct backref_cache_entry {
- /* List to link to the cache's lru list. */
- struct list_head list;
- /* The key for this entry in the cache. */
- u64 key;
+ struct btrfs_lru_cache_entry entry;
u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS];
/* Number of valid elements in the root_ids array. */
int num_roots;
};
+/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
+static_assert(offsetof(struct backref_cache_entry, entry) == 0);
+
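A quick sanity check of the arithmetic in the two comments above (an illustrative sketch, not part of the patch; it assumes x86_64 sizes and the 48-byte struct btrfs_lru_cache_entry cited further down for the directory caches):

	/*
	 * 48 bytes (embedded btrfs_lru_cache_entry)
	 *  + 17 * 8 bytes (root_ids)
	 *  + 8 bytes (num_roots plus tail padding)
	 *  = 192 bytes, an exact fit for the kmalloc-192 slab. With
	 * SEND_MAX_BACKREF_CACHE_SIZE at 128 entries, the entries take
	 * 128 * 192 = 24576 bytes (24K), excluding the cache's internal
	 * index structures.
	 */
	static_assert(sizeof(struct backref_cache_entry) == 192);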
+/*
+ * Max number of entries in the cache that stores directories that were already
+ * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses
+ * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but
+ * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64).
+ */
+#define SEND_MAX_DIR_CREATED_CACHE_SIZE 64
+
+/*
+ * Max number of entries in the cache that stores directories for which we
+ * have delayed sending the utimes command. The cache uses raw struct
+ * btrfs_lru_cache_entry entries, so it uses at most 4096 bytes -
+ * sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but the kmalloc-64 slab
+ * is used, so we get 4096 bytes (64 bytes * 64).
+ */
+#define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64
+
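Worked out, the 4096 byte bound quoted in both comments follows from slab rounding (a sketch; the kmalloc() call is the one used by cache_dir_created() below, and the rounding is the allocator's usual behavior):

	/*
	 * Each entry is allocated with kmalloc(sizeof(struct btrfs_lru_cache_entry)),
	 * i.e. 48 bytes, which the slab allocator rounds up to the kmalloc-64
	 * bucket, so either cache caps out at 64 entries * 64 bytes = 4096 bytes.
	 */
	static_assert(SEND_MAX_DIR_UTIMES_CACHE_SIZE * 64 == 4096);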
struct send_ctx {
struct file *send_filp;
loff_t send_off;
@@ -174,9 +191,7 @@ struct send_ctx {
struct list_head new_refs;
struct list_head deleted_refs;
- struct radix_tree_root name_cache;
- struct list_head name_cache_list;
- int name_cache_size;
+ struct btrfs_lru_cache name_cache;
/*
* The inode we are currently processing. It's not NULL only when we
@@ -285,13 +300,11 @@ struct send_ctx {
struct rb_root rbtree_new_refs;
struct rb_root rbtree_deleted_refs;
- struct {
- u64 last_reloc_trans;
- struct list_head lru_list;
- struct maple_tree entries;
- /* Number of entries stored in the cache. */
- int size;
- } backref_cache;
+ struct btrfs_lru_cache backref_cache;
+ u64 backref_cache_last_reloc_trans;
+
+ struct btrfs_lru_cache dir_created_cache;
+ struct btrfs_lru_cache dir_utimes_cache;
};
struct pending_dir_move {
@@ -321,21 +334,15 @@ struct orphan_dir_info {
u64 ino;
u64 gen;
u64 last_dir_index_offset;
+ u64 dir_high_seq_ino;
};
struct name_cache_entry {
- struct list_head list;
/*
- * radix_tree has only 32bit entries but we need to handle 64bit inums.
- * We use the lower 32bit of the 64bit inum to store it in the tree. If
- * more then one inum would fall into the same entry, we use radix_list
- * to store the additional entries. radix_list is also used to store
- * entries where two entries have the same inum but different
- * generations.
+ * The key in the entry is an inode number, and the generation matches
+ * the inode's generation.
*/
- struct list_head radix_list;
- u64 ino;
- u64 gen;
+ struct btrfs_lru_cache_entry entry;
u64 parent_ino;
u64 parent_gen;
int ret;
@@ -344,6 +351,9 @@ struct name_cache_entry {
char name[];
};
+/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
+static_assert(offsetof(struct name_cache_entry, entry) == 0);
+
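The offset-0 rule enforced by this static_assert (and by the earlier one for struct backref_cache_entry) makes the two pointer views of an entry interchangeable. A minimal sketch of the round trip, using only helpers that appear in this patch; the remark about freeing is an assumption, with lru_cache.h holding the authoritative rationale:

	struct btrfs_lru_cache_entry *raw;
	struct name_cache_entry *nce;

	raw = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen);
	if (raw) {
		/* Valid because 'entry' sits at offset 0 of the struct. */
		nce = container_of(raw, struct name_cache_entry, entry);
	}
	/*
	 * With offset 0, &nce->entry == (void *)nce, so the generic cache
	 * code can presumably free a whole entry through the embedded
	 * member pointer alone.
	 */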
#define ADVANCE 1
#define ADVANCE_ONLY_NEXT -1
@@ -956,14 +966,12 @@ out:
static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
{
int ret;
- struct btrfs_inode_info info;
+ struct btrfs_inode_info info = { 0 };
- if (!gen)
- return -EPERM;
+ ASSERT(gen);
ret = get_inode_info(root, ino, &info);
- if (!ret)
- *gen = info.gen;
+ *gen = info.gen;
return ret;
}
@@ -1388,19 +1396,6 @@ static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id,
return 0;
}
-static void empty_backref_cache(struct send_ctx *sctx)
-{
- struct backref_cache_entry *entry;
- struct backref_cache_entry *tmp;
-
- list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list)
- kfree(entry);
-
- INIT_LIST_HEAD(&sctx->backref_cache.lru_list);
- mtree_destroy(&sctx->backref_cache.entries);
- sctx->backref_cache.size = 0;
-}
-
static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
const u64 **root_ids_ret, int *root_count_ret)
{
@@ -1408,9 +1403,10 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
struct send_ctx *sctx = bctx->sctx;
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
const u64 key = leaf_bytenr >> fs_info->sectorsize_bits;
+ struct btrfs_lru_cache_entry *raw_entry;
struct backref_cache_entry *entry;
- if (sctx->backref_cache.size == 0)
+ if (btrfs_lru_cache_size(&sctx->backref_cache) == 0)
return false;
/*
@@ -1424,18 +1420,18 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
* transaction handle or holding fs_info->commit_root_sem, so no need
* to take any lock here.
*/
- if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) {
- empty_backref_cache(sctx);
+ if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) {
+ btrfs_lru_cache_clear(&sctx->backref_cache);
return false;
}
- entry = mtree_load(&sctx->backref_cache.entries, key);
- if (!entry)
+ raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0);
+ if (!raw_entry)
return false;
+ entry = container_of(raw_entry, struct backref_cache_entry, entry);
*root_ids_ret = entry->root_ids;
*root_count_ret = entry->num_roots;
- list_move_tail(&entry->list, &sctx->backref_cache.lru_list);
return true;
}
@@ -1461,7 +1457,8 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
if (!new_entry)
return;
- new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits;
+ new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits;
+ new_entry->entry.gen = 0;
new_entry->num_roots = 0;
ULIST_ITER_INIT(&uiter);
while ((node = ulist_next(root_ids, &uiter)) != NULL) {
@@ -1489,23 +1486,12 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
* none of the roots is part of the list of roots from which we are
* allowed to clone. Cache the new entry as it's still useful to avoid
* backref walking to determine which roots have a path to the leaf.
+ *
+ * Also use GFP_NOFS because we're called while holding a transaction
+ * handle or while holding fs_info->commit_root_sem.
*/
-
- if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) {
- struct backref_cache_entry *lru_entry;
- struct backref_cache_entry *mt_entry;
-
- lru_entry = list_first_entry(&sctx->backref_cache.lru_list,
- struct backref_cache_entry, list);
- mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key);
- ASSERT(mt_entry == lru_entry);
- list_del(&mt_entry->list);
- kfree(mt_entry);
- sctx->backref_cache.size--;
- }
-
- ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key,
- new_entry, GFP_NOFS);
+ ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry,
+ GFP_NOFS);
ASSERT(ret == 0 || ret == -ENOMEM);
if (ret) {
/* Caching is optional, no worries. */
@@ -1513,17 +1499,13 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
return;
}
- list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list);
-
/*
* We are called from iterate_extent_inodes() while either holding a
* transaction handle or holding fs_info->commit_root_sem, so no need
* to take any lock here.
*/
- if (sctx->backref_cache.size == 0)
- sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans;
-
- sctx->backref_cache.size++;
+ if (btrfs_lru_cache_size(&sctx->backref_cache) == 1)
+ sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans;
}
static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei,
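Taken together, lookup_backref_cache() and store_backref_cache() exercise the whole life cycle of the new generic LRU cache. A condensed usage sketch, with signatures inferred solely from the call sites in this patch (lru_cache.h itself is not shown here):

	struct btrfs_lru_cache_entry *raw_entry;
	struct backref_cache_entry *new_entry;
	int ret;

	/* One-time setup with a maximum entry count (done in
	 * btrfs_ioctl_send() further down). */
	btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE);

	/* Store: fill key/gen of the embedded entry, then hand it over. */
	new_entry = kzalloc(sizeof(*new_entry), GFP_NOFS);
	if (new_entry) {
		new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits;
		new_entry->entry.gen = 0;
		ret = btrfs_lru_cache_store(&sctx->backref_cache,
					    &new_entry->entry, GFP_NOFS);
		if (ret)
			kfree(new_entry);	/* caching is optional */
	}

	/* Lookup returns the embedded entry (or NULL); container_of()
	 * recovers the containing structure. */
	raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0);

	/* Drop everything, e.g. after detecting a relocation. */
	btrfs_lru_cache_clear(&sctx->backref_cache);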
@@ -1886,7 +1868,8 @@ enum inode_state {
inode_state_did_delete,
};
-static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
+static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
+ u64 *send_gen, u64 *parent_gen)
{
int ret;
int left_ret;
@@ -1900,6 +1883,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
goto out;
left_ret = (info.nlink == 0) ? -ENOENT : ret;
left_gen = info.gen;
+ if (send_gen)
+ *send_gen = ((left_ret == -ENOENT) ? 0 : info.gen);
if (!sctx->parent_root) {
right_ret = -ENOENT;
@@ -1909,6 +1894,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
goto out;
right_ret = (info.nlink == 0) ? -ENOENT : ret;
right_gen = info.gen;
+ if (parent_gen)
+ *parent_gen = ((right_ret == -ENOENT) ? 0 : info.gen);
}
if (!left_ret && !right_ret) {
@@ -1953,14 +1940,15 @@ out:
return ret;
}
-static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
+static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen,
+ u64 *send_gen, u64 *parent_gen)
{
int ret;
if (ino == BTRFS_FIRST_FREE_OBJECTID)
return 1;
- ret = get_cur_inode_state(sctx, ino, gen);
+ ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen);
if (ret < 0)
goto out;
@@ -2121,43 +2109,36 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
const char *name, int name_len,
u64 *who_ino, u64 *who_gen, u64 *who_mode)
{
- int ret = 0;
- u64 gen;
+ int ret;
+ u64 parent_root_dir_gen;
u64 other_inode = 0;
struct btrfs_inode_info info;
if (!sctx->parent_root)
- goto out;
+ return 0;
- ret = is_inode_existent(sctx, dir, dir_gen);
+ ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen);
if (ret <= 0)
- goto out;
+ return 0;
/*
* If we have a parent root we need to verify that the parent dir was
* not deleted and then re-created, if it was then we have no overwrite
* and we can just unlink this entry.
+ *
+ * @parent_root_dir_gen was set to 0 if the inode does not exist in the
+ * parent root.
*/
- if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) {
- ret = get_inode_gen(sctx->parent_root, dir, &gen);
- if (ret < 0 && ret != -ENOENT)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
- if (gen != dir_gen)
- goto out;
- }
+ if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID &&
+ parent_root_dir_gen != dir_gen)
+ return 0;
ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
&other_inode);
- if (ret < 0 && ret != -ENOENT)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
+ if (ret == -ENOENT)
+ return 0;
+ else if (ret < 0)
+ return ret;
/*
* Check if the overwritten ref was already processed. If yes, the ref
@@ -2168,18 +2149,15 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
is_waiting_for_move(sctx, other_inode)) {
ret = get_inode_info(sctx->parent_root, other_inode, &info);
if (ret < 0)
- goto out;
+ return ret;
- ret = 1;
*who_ino = other_inode;
*who_gen = info.gen;
*who_mode = info.mode;
- } else {
- ret = 0;
+ return 1;
}
-out:
- return ret;
+ return 0;
}
/*
@@ -2194,47 +2172,43 @@ static int did_overwrite_ref(struct send_ctx *sctx,
u64 ino, u64 ino_gen,
const char *name, int name_len)
{
- int ret = 0;
- u64 gen;
+ int ret;
u64 ow_inode;
+ u64 ow_gen = 0;
+ u64 send_root_dir_gen;
if (!sctx->parent_root)
- goto out;
+ return 0;
- ret = is_inode_existent(sctx, dir, dir_gen);
+ ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL);
if (ret <= 0)
- goto out;
+ return ret;
- if (dir != BTRFS_FIRST_FREE_OBJECTID) {
- ret = get_inode_gen(sctx->send_root, dir, &gen);
- if (ret < 0 && ret != -ENOENT)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
- if (gen != dir_gen)
- goto out;
- }
+ /*
+ * @send_root_dir_gen was set to 0 if the inode does not exist in the
+ * send root.
+ */
+ if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen)
+ return 0;
/* check if the ref was overwritten by another ref */
ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
&ow_inode);
- if (ret < 0 && ret != -ENOENT)
- goto out;
- if (ret) {
+ if (ret == -ENOENT) {
/* was never and will never be overwritten */
- ret = 0;
- goto out;
+ return 0;
+ } else if (ret < 0) {
+ return ret;
}
- ret = get_inode_gen(sctx->send_root, ow_inode, &gen);
- if (ret < 0)
- goto out;
+ if (ow_inode == ino) {
+ ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen);
+ if (ret < 0)
+ return ret;
- if (ow_inode == ino && gen == ino_gen) {
- ret = 0;
- goto out;
+ /* It's the same inode, so no overwrite happened. */
+ if (ow_gen == ino_gen)
+ return 0;
}
/*
@@ -2243,15 +2217,20 @@ static int did_overwrite_ref(struct send_ctx *sctx,
* inode 'ino' to be orphanized, therefore check if ow_inode matches
* the current inode being processed.
*/
- if ((ow_inode < sctx->send_progress) ||
- (ino != sctx->cur_ino && ow_inode == sctx->cur_ino &&
- gen == sctx->cur_inode_gen))
- ret = 1;
- else
- ret = 0;
+ if (ow_inode < sctx->send_progress)
+ return 1;
-out:
- return ret;
+ if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) {
+ if (ow_gen == 0) {
+ ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen);
+ if (ret < 0)
+ return ret;
+ }
+ if (ow_gen == sctx->cur_inode_gen)
+ return 1;
+ }
+
+ return 0;
}
/*
@@ -2285,113 +2264,16 @@ out:
return ret;
}
-/*
- * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
- * so we need to do some special handling in case we have clashes. This function
- * takes care of this with the help of name_cache_entry::radix_list.
- * In case of error, nce is kfreed.
- */
-static int name_cache_insert(struct send_ctx *sctx,
- struct name_cache_entry *nce)
+static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
+ u64 ino, u64 gen)
{
- int ret = 0;
- struct list_head *nce_head;
-
- nce_head = radix_tree_lookup(&sctx->name_cache,
- (unsigned long)nce->ino);
- if (!nce_head) {
- nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
- if (!nce_head) {
- kfree(nce);
- return -ENOMEM;
- }
- INIT_LIST_HEAD(nce_head);
-
- ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
- if (ret < 0) {
- kfree(nce_head);
- kfree(nce);
- return ret;
- }
- }
- list_add_tail(&nce->radix_list, nce_head);
- list_add_tail(&nce->list, &sctx->name_cache_list);
- sctx->name_cache_size++;
-
- return ret;
-}
+ struct btrfs_lru_cache_entry *entry;
-static void name_cache_delete(struct send_ctx *sctx,
- struct name_cache_entry *nce)
-{
- struct list_head *nce_head;
-
- nce_head = radix_tree_lookup(&sctx->name_cache,
- (unsigned long)nce->ino);
- if (!nce_head) {
- btrfs_err(sctx->send_root->fs_info,
- "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
- nce->ino, sctx->name_cache_size);
- }
-
- list_del(&nce->radix_list);
- list_del(&nce->list);
- sctx->name_cache_size--;
-
- /*
- * We may not get to the final release of nce_head if the lookup fails
- */
- if (nce_head && list_empty(nce_head)) {
- radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
- kfree(nce_head);
- }
-}
-
-static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
- u64 ino, u64 gen)
-{
- struct list_head *nce_head;
- struct name_cache_entry *cur;
-
- nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
- if (!nce_head)
+ entry = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen);
+ if (!entry)
return NULL;
- list_for_each_entry(cur, nce_head, radix_list) {
- if (cur->ino == ino && cur->gen == gen)
- return cur;
- }
- return NULL;
-}
-
-/*
- * Remove some entries from the beginning of name_cache_list.
- */
-static void name_cache_clean_unused(struct send_ctx *sctx)
-{
- struct name_cache_entry *nce;
-
- if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE)
- return;
-
- while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) {
- nce = list_entry(sctx->name_cache_list.next,
- struct name_cache_entry, list);
- name_cache_delete(sctx, nce);
- kfree(nce);
- }
-}
-
-static void name_cache_free(struct send_ctx *sctx)
-{
- struct name_cache_entry *nce;
-
- while (!list_empty(&sctx->name_cache_list)) {
- nce = list_entry(sctx->name_cache_list.next,
- struct name_cache_entry, list);
- name_cache_delete(sctx, nce);
- kfree(nce);
- }
+ return container_of(entry, struct name_cache_entry, entry);
}
/*
@@ -2410,7 +2292,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
{
int ret;
int nce_ret;
- struct name_cache_entry *nce = NULL;
+ struct name_cache_entry *nce;
/*
* First check if we already did a call to this function with the same
@@ -2420,17 +2302,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
nce = name_cache_search(sctx, ino, gen);
if (nce) {
if (ino < sctx->send_progress && nce->need_later_update) {
- name_cache_delete(sctx, nce);
- kfree(nce);
+ btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry);
nce = NULL;
} else {
- /*
- * Removes the entry from the list and adds it back to
- * the end. This marks the entry as recently used so
- * that name_cache_clean_unused does not remove it.
- */
- list_move_tail(&nce->list, &sctx->name_cache_list);
-
*parent_ino = nce->parent_ino;
*parent_gen = nce->parent_gen;
ret = fs_path_add(dest, nce->name, nce->name_len);
@@ -2446,7 +2320,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
* This should only happen for the parent dir that we determine in
* record_new_ref_if_needed().
*/
- ret = is_inode_existent(sctx, ino, gen);
+ ret = is_inode_existent(sctx, ino, gen, NULL, NULL);
if (ret < 0)
goto out;
@@ -2497,8 +2371,8 @@ out_cache:
goto out;
}
- nce->ino = ino;
- nce->gen = gen;
+ nce->entry.key = ino;
+ nce->entry.gen = gen;
nce->parent_ino = *parent_ino;
nce->parent_gen = *parent_gen;
nce->name_len = fs_path_len(dest);
@@ -2510,10 +2384,11 @@ out_cache:
else
nce->need_later_update = 1;
- nce_ret = name_cache_insert(sctx, nce);
- if (nce_ret < 0)
+ nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL);
+ if (nce_ret < 0) {
+ kfree(nce);
ret = nce_ret;
- name_cache_clean_unused(sctx);
+ }
out:
return ret;
@@ -2884,6 +2759,63 @@ out:
}
/*
+ * If the cache is full, we can't remove entries from it and issue a call to
+ * send_utimes() for each respective inode, because we might be finishing
+ * processing an inode that is a directory and it just got renamed, and existing
+ * entries in the cache may refer to inodes that have the directory in their
+ * full path - in which case we would generate outdated paths (pre-rename)
+ * for the inodes that the cache entries point to. Instead of pruning the
+ * cache when inserting, do it after we finish processing each inode at
+ * finish_inode_if_needed().
+ */
+static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen)
+{
+ struct btrfs_lru_cache_entry *entry;
+ int ret;
+
+ entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen);
+ if (entry != NULL)
+ return 0;
+
+ /* Caching is optional, don't fail if we can't allocate memory. */
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return send_utimes(sctx, dir, gen);
+
+ entry->key = dir;
+ entry->gen = gen;
+
+ ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL);
+ ASSERT(ret != -EEXIST);
+ if (ret) {
+ kfree(entry);
+ return send_utimes(sctx, dir, gen);
+ }
+
+ return 0;
+}
+
+static int trim_dir_utimes_cache(struct send_ctx *sctx)
+{
+ while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) >
+ SEND_MAX_DIR_UTIMES_CACHE_SIZE) {
+ struct btrfs_lru_cache_entry *lru;
+ int ret;
+
+ lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache);
+ ASSERT(lru != NULL);
+
+ ret = send_utimes(sctx, lru->key, lru->gen);
+ if (ret)
+ return ret;
+
+ btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru);
+ }
+
+ return 0;
+}
+
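Putting the two helpers above in context, the delayed utimes flow added by this patch has three stages; the sketch below only reorders calls that appear verbatim elsewhere in this diff:

	/* 1) While processing inodes: record instead of sending. */
	ret = cache_dir_utimes(sctx, dir, gen);

	/* 2) After finishing each inode, from finish_inode_if_needed():
	 *    bound the cache, sending utimes for the least recently used
	 *    entries first. */
	ret = trim_dir_utimes_cache(sctx);

	/* 3) At the end of the send stream, in btrfs_ioctl_send(): flush
	 *    whatever is still cached. */
	btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) {
		ret = send_utimes(sctx, entry->key, entry->gen);
		if (ret < 0)
			break;
		btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry);
	}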
+/*
* Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
* a valid path yet because we did not process the refs yet. So, the inode
* is created as orphan.
@@ -2971,6 +2903,23 @@ out:
return ret;
}
+static void cache_dir_created(struct send_ctx *sctx, u64 dir)
+{
+ struct btrfs_lru_cache_entry *entry;
+ int ret;
+
+ /* Caching is optional, ignore any failures. */
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return;
+
+ entry->key = dir;
+ entry->gen = 0;
+ ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL);
+ if (ret < 0)
+ kfree(entry);
+}
+
/*
* We need some special handling for inodes that get processed before the parent
* directory got created. See process_recorded_refs for details.
@@ -2986,6 +2935,9 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
struct btrfs_key di_key;
struct btrfs_dir_item *di;
+ if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0))
+ return 1;
+
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
@@ -3009,6 +2961,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
di_key.objectid < sctx->send_progress) {
ret = 1;
+ cache_dir_created(sctx, dir);
break;
}
}
@@ -3038,7 +2991,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx)
return 0;
}
- return send_create_inode(sctx, sctx->cur_ino);
+ ret = send_create_inode(sctx, sctx->cur_ino);
+
+ if (ret == 0 && S_ISDIR(sctx->cur_inode_mode))
+ cache_dir_created(sctx, sctx->cur_ino);
+
+ return ret;
}
struct recorded_ref {
@@ -3166,6 +3124,7 @@ static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx,
odi->ino = dir_ino;
odi->gen = dir_gen;
odi->last_dir_index_offset = 0;
+ odi->dir_high_seq_ino = 0;
rb_link_node(&odi->node, parent, p);
rb_insert_color(&odi->node, &sctx->orphan_dirs);
@@ -3215,8 +3174,7 @@ static void free_orphan_dir_info(struct send_ctx *sctx,
* We check this by iterating all dir items and checking if the inode behind
* the dir item was already processed.
*/
-static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
- u64 send_progress)
+static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
{
int ret = 0;
int iter_ret = 0;
@@ -3227,6 +3185,8 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
struct btrfs_key loc;
struct btrfs_dir_item *di;
struct orphan_dir_info *odi = NULL;
+ u64 dir_high_seq_ino = 0;
+ u64 last_dir_index_offset = 0;
/*
* Don't try to rmdir the top/root subvolume dir.
@@ -3234,17 +3194,62 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
if (dir == BTRFS_FIRST_FREE_OBJECTID)
return 0;
+ odi = get_orphan_dir_info(sctx, dir, dir_gen);
+ if (odi && sctx->cur_ino < odi->dir_high_seq_ino)
+ return 0;
+
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
+ if (!odi) {
+ /*
+ * Find the inode number associated with the last dir index
+ * entry. This is very likely the inode with the highest number
+ * of all inodes that have an entry in the directory. We can
+ * then use it to avoid future calls to can_rmdir(), when
+ * processing inodes with a lower number, from having to search
+ * the parent root b+tree for dir index keys.
+ */
+ key.objectid = dir;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ /* Can't happen, the root is never empty. */
+ ASSERT(path->slots[0] > 0);
+ if (WARN_ON(path->slots[0] == 0)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ path->slots[0]--;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != dir || key.type != BTRFS_DIR_INDEX_KEY) {
+ /* No index keys, dir can be removed. */
+ ret = 1;
+ goto out;
+ }
+
+ di = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
+ dir_high_seq_ino = loc.objectid;
+ if (sctx->cur_ino < dir_high_seq_ino) {
+ ret = 0;
+ goto out;
+ }
+
+ btrfs_release_path(path);
+ }
+
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
- key.offset = 0;
-
- odi = get_orphan_dir_info(sctx, dir, dir_gen);
- if (odi)
- key.offset = odi->last_dir_index_offset;
+ key.offset = (odi ? odi->last_dir_index_offset : 0);
btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct waiting_dir_move *dm;
@@ -3257,29 +3262,18 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
+ dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid);
+ last_dir_index_offset = found_key.offset;
+
dm = get_waiting_dir_move(sctx, loc.objectid);
if (dm) {
- odi = add_orphan_dir_info(sctx, dir, dir_gen);
- if (IS_ERR(odi)) {
- ret = PTR_ERR(odi);
- goto out;
- }
- odi->gen = dir_gen;
- odi->last_dir_index_offset = found_key.offset;
dm->rmdir_ino = dir;
dm->rmdir_gen = dir_gen;
ret = 0;
goto out;
}
- if (loc.objectid > send_progress) {
- odi = add_orphan_dir_info(sctx, dir, dir_gen);
- if (IS_ERR(odi)) {
- ret = PTR_ERR(odi);
- goto out;
- }
- odi->gen = dir_gen;
- odi->last_dir_index_offset = found_key.offset;
+ if (loc.objectid > sctx->cur_ino) {
ret = 0;
goto out;
}
@@ -3294,7 +3288,22 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
out:
btrfs_free_path(path);
- return ret;
+
+ if (ret)
+ return ret;
+
+ if (!odi) {
+ odi = add_orphan_dir_info(sctx, dir, dir_gen);
+ if (IS_ERR(odi))
+ return PTR_ERR(odi);
+
+ odi->gen = dir_gen;
+ }
+
+ odi->last_dir_index_offset = last_dir_index_offset;
+ odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino);
+
+ return 0;
}
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
@@ -3579,7 +3588,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
}
gen = odi->gen;
- ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino);
+ ret = can_rmdir(sctx, rmdir_ino, gen);
if (ret < 0)
goto out;
if (!ret)
@@ -3599,7 +3608,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
}
finish:
- ret = send_utimes(sctx, pm->ino, pm->gen);
+ ret = cache_dir_utimes(sctx, pm->ino, pm->gen);
if (ret < 0)
goto out;
@@ -3619,7 +3628,7 @@ finish:
if (ret < 0)
goto out;
- ret = send_utimes(sctx, cur->dir, cur->dir_gen);
+ ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
}
@@ -4242,7 +4251,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* "testdir_2".
*/
list_for_each_entry(cur, &sctx->new_refs, list) {
- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL);
if (ret < 0)
goto out;
if (ret == inode_state_will_create)
@@ -4288,12 +4297,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* the source path when performing its rename
* operation.
*/
- if (is_waiting_for_move(sctx, ow_inode)) {
- wdm = get_waiting_dir_move(sctx,
- ow_inode);
- ASSERT(wdm);
+ wdm = get_waiting_dir_move(sctx, ow_inode);
+ if (wdm)
wdm->orphanized = true;
- }
/*
* Make sure we clear our orphanized inode's
@@ -4306,10 +4312,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* and get instead the orphan name.
*/
nce = name_cache_search(sctx, ow_inode, ow_gen);
- if (nce) {
- name_cache_delete(sctx, nce);
- kfree(nce);
- }
+ if (nce)
+ btrfs_lru_cache_remove(&sctx->name_cache,
+ &nce->entry);
/*
* ow_inode might currently be an ancestor of
@@ -4358,7 +4363,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* parent directory out of order. But we need to check if this
* did already happen before due to other refs in the same dir.
*/
- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL);
if (ret < 0)
goto out;
if (ret == inode_state_will_create) {
@@ -4388,6 +4393,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
ret = send_create_inode(sctx, cur->dir);
if (ret < 0)
goto out;
+ cache_dir_created(sctx, cur->dir);
}
}
@@ -4470,8 +4476,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* later, we do this check again and rmdir it then if possible.
* See the use of check_dirs for more details.
*/
- ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
- sctx->cur_ino);
+ ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen);
if (ret < 0)
goto out;
if (ret) {
@@ -4564,20 +4569,18 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (cur->dir > sctx->cur_ino)
continue;
- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL);
if (ret < 0)
goto out;
if (ret == inode_state_did_create ||
ret == inode_state_no_change) {
- /* TODO delayed utimes */
- ret = send_utimes(sctx, cur->dir, cur->dir_gen);
+ ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
} else if (ret == inode_state_did_delete &&
cur->dir != last_dir_ino_rm) {
- ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
- sctx->cur_ino);
+ ret = can_rmdir(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
if (ret) {
@@ -5635,7 +5638,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
* boundary in the send buffer. This means that there may be a gap
* between the beginning of the command and the file data.
*/
- data_offset = ALIGN(sctx->send_size, PAGE_SIZE);
+ data_offset = PAGE_ALIGN(sctx->send_size);
if (data_offset > sctx->send_max_size ||
sctx->send_max_size - data_offset < disk_num_bytes) {
ret = -EOVERFLOW;
@@ -5759,7 +5762,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
sent += size;
}
- if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) {
+ if (sctx->clean_page_cache && PAGE_ALIGNED(end)) {
/*
* Always operate only on ranges that are a multiple of the page
* size. This is not only to prevent zeroing parts of a page in
@@ -6754,12 +6757,26 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
* it's moved/renamed, therefore we don't need to do it here.
*/
sctx->send_progress = sctx->cur_ino + 1;
- ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
+
+ /*
+ * If the current inode is a non-empty directory, delay issuing
+ * the utimes command for it, as it's very likely we have inodes
+ * with a higher number inside it. We want to issue the utimes
+ * command only after adding all dentries to it.
+ */
+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0)
+ ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
+ else
+ ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
+
if (ret < 0)
goto out;
}
out:
+ if (!ret)
+ ret = trim_dir_utimes_cache(sctx);
+
return ret;
}
@@ -8044,6 +8061,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
int clone_sources_to_rollback = 0;
size_t alloc_size;
int sort_clone_roots = 0;
+ struct btrfs_lru_cache_entry *entry;
+ struct btrfs_lru_cache_entry *tmp;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -8073,10 +8092,10 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
/*
* Check that we don't overflow at later allocations, we request
* clone_sources_count + 1 items, and compare to unsigned long inside
- * access_ok.
+ * access_ok. Also set an upper limit for allocation size so this can't
+ * easily exhaust memory. Max number of clone sources is about 200K.
*/
- if (arg->clone_sources_count >
- ULONG_MAX / sizeof(struct clone_root) - 1) {
+ if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) {
ret = -EINVAL;
goto out;
}
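The "about 200K" figure follows from the new bound; a worked sketch of the arithmetic, assuming sizeof(struct clone_root) is 40 bytes on x86_64 (an estimate, since the full struct definition is not visible in this diff):

	/*
	 * SZ_8M / sizeof(struct clone_root)
	 *   = 8388608 / 40
	 *   ~ 209715 clone sources at most,
	 * so the kvcalloc() of clone_sources_count + 1 clone roots further
	 * down stays bounded at roughly 8 MiB.
	 */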
@@ -8094,11 +8113,22 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
INIT_LIST_HEAD(&sctx->new_refs);
INIT_LIST_HEAD(&sctx->deleted_refs);
- INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
- INIT_LIST_HEAD(&sctx->name_cache_list);
- INIT_LIST_HEAD(&sctx->backref_cache.lru_list);
- mt_init(&sctx->backref_cache.entries);
+ btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE);
+ btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE);
+ btrfs_lru_cache_init(&sctx->dir_created_cache,
+ SEND_MAX_DIR_CREATED_CACHE_SIZE);
+ /*
+ * This cache is periodically trimmed to a fixed size elsewhere, see
+ * cache_dir_utimes() and trim_dir_utimes_cache().
+ */
+ btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0);
+
+ sctx->pending_dir_moves = RB_ROOT;
+ sctx->waiting_dir_moves = RB_ROOT;
+ sctx->orphan_dirs = RB_ROOT;
+ sctx->rbtree_new_refs = RB_ROOT;
+ sctx->rbtree_deleted_refs = RB_ROOT;
sctx->flags = arg->flags;
@@ -8165,12 +8195,6 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
goto out;
}
- sctx->pending_dir_moves = RB_ROOT;
- sctx->waiting_dir_moves = RB_ROOT;
- sctx->orphan_dirs = RB_ROOT;
- sctx->rbtree_new_refs = RB_ROOT;
- sctx->rbtree_deleted_refs = RB_ROOT;
-
sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
arg->clone_sources_count + 1,
GFP_KERNEL);
@@ -8279,6 +8303,13 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
if (ret < 0)
goto out;
+ btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) {
+ ret = send_utimes(sctx, entry->key, entry->gen);
+ if (ret < 0)
+ goto out;
+ btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry);
+ }
+
if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
ret = begin_cmd(sctx, BTRFS_SEND_C_END);
if (ret < 0)
@@ -8358,11 +8389,12 @@ out:
kvfree(sctx->send_buf);
kvfree(sctx->verity_descriptor);
- name_cache_free(sctx);
-
close_current_inode(sctx);
- empty_backref_cache(sctx);
+ btrfs_lru_cache_clear(&sctx->name_cache);
+ btrfs_lru_cache_clear(&sctx->backref_cache);
+ btrfs_lru_cache_clear(&sctx->dir_created_cache);
+ btrfs_lru_cache_clear(&sctx->dir_utimes_cache);
kfree(sctx);
}