Diffstat (limited to 'fs/btrfs/send.c')
-rw-r--r--  fs/btrfs/send.c  541
1 files changed, 403 insertions, 138 deletions
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f53e8049473d..e65e6b6600a7 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -27,6 +27,11 @@
#include "compression.h"
#include "xattr.h"
#include "print-tree.h"
+#include "accessors.h"
+#include "dir-item.h"
+#include "file-item.h"
+#include "ioctl.h"
+#include "verity.h"
/*
* Maximum number of references an extent can have in order for us to attempt to
@@ -34,7 +39,7 @@
* avoid hitting limitations of the backreference walking code (taking a lot of
* time and using too much memory for extents with large number of references).
*/
-#define SEND_MAX_EXTENT_REFS 64
+#define SEND_MAX_EXTENT_REFS 1024
/*
* A fs_path is a helper to dynamically build path names with unknown size.
@@ -71,13 +76,46 @@ struct clone_root {
struct btrfs_root *root;
u64 ino;
u64 offset;
-
- u64 found_refs;
+ u64 num_bytes;
+ bool found_ref;
};
#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
+/*
+ * Limit the root_ids array of struct backref_cache_entry to 12 elements.
+ * This makes the size of a cache entry to be exactly 128 bytes on x86_64.
+ * The most common case is to have a single root for cloning, which corresponds
+ * to the send root. Having the user specify more than 11 clone roots is not
+ * common, and in such rare cases we simply don't use caching if the number of
+ * cloning roots that lead down to a leaf is more than 12.
+ */
+#define SEND_MAX_BACKREF_CACHE_ROOTS 12
+
+/*
+ * Max number of entries in the cache.
+ * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding
+ * maple tree's internal nodes, is 16K.
+ */
+#define SEND_MAX_BACKREF_CACHE_SIZE 128
+
+/*
+ * A backref cache entry maps a leaf to a list of IDs of roots from which the
+ * leaf is accessible and we can use for clone operations.
+ * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, each cache entry is 128 bytes (on
+ * x86_64).
+ */
+struct backref_cache_entry {
+ /* List to link to the cache's lru list. */
+ struct list_head list;
+ /* The key for this entry in the cache. */
+ u64 key;
+ u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS];
+ /* Number of valid elements in the root_ids array. */
+ int num_roots;
+};
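
The 128-byte figure quoted in the comments above can be sanity-checked with a quick userspace model of the entry (a sketch that assumes x86_64 type sizes and models struct list_head as two bare pointers; none of this is kernel code):

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's struct list_head: two pointers. */
struct list_head { struct list_head *next, *prev; };

#define SEND_MAX_BACKREF_CACHE_ROOTS 12
#define SEND_MAX_BACKREF_CACHE_SIZE  128

struct backref_cache_entry {
	struct list_head list;                           /* 16 bytes */
	uint64_t key;                                    /*  8 bytes */
	uint64_t root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; /* 96 bytes */
	int num_roots;                                   /*  4 bytes + 4 of padding */
};

/* 16 + 8 + 96 + 4 = 124, padded up to the struct's 8-byte alignment -> 128. */
_Static_assert(sizeof(struct backref_cache_entry) == 128, "entry is not 128 bytes");

int main(void)
{
	/* 128 entries * 128 bytes = 16K, matching the "16K" in the comment above. */
	printf("entry: %zu bytes, cache: %zu bytes\n",
	       sizeof(struct backref_cache_entry),
	       (size_t)SEND_MAX_BACKREF_CACHE_SIZE * sizeof(struct backref_cache_entry));
	return 0;
}
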
+
struct send_ctx {
struct file *send_filp;
loff_t send_off;
@@ -246,6 +284,14 @@ struct send_ctx {
struct rb_root rbtree_new_refs;
struct rb_root rbtree_deleted_refs;
+
+ struct {
+ u64 last_reloc_trans;
+ struct list_head lru_list;
+ struct maple_tree entries;
+ /* Number of entries stored in the cache. */
+ int size;
+ } backref_cache;
};
struct pending_dir_move {
@@ -348,6 +394,7 @@ static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
switch (sctx->proto) {
case 1: return cmd <= BTRFS_SEND_C_MAX_V1;
case 2: return cmd <= BTRFS_SEND_C_MAX_V2;
+ case 3: return cmd <= BTRFS_SEND_C_MAX_V3;
default: return false;
}
}
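
For illustration, a minimal userspace sketch of the same version gating, using made-up command numbers (the real BTRFS_SEND_C_MAX_V1/V2/V3 values are defined in send.h and are not reproduced here): commands above the negotiated protocol's ceiling are simply not emitted, which is how v3-only commands such as the verity ones are kept out of older streams.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical per-version command ceilings, for illustration only. */
enum { CMD_MAX_V1 = 22, CMD_MAX_V2 = 25, CMD_MAX_V3 = 27 };

static bool cmd_ok(int proto, int cmd)
{
	switch (proto) {
	case 1: return cmd <= CMD_MAX_V1;
	case 2: return cmd <= CMD_MAX_V2;
	case 3: return cmd <= CMD_MAX_V3;
	default: return false;
	}
}

int main(void)
{
	const int enable_verity_cmd = 26;	/* hypothetical v3-only command */

	for (int proto = 1; proto <= 3; proto++)
		printf("proto %d: verity command %s\n", proto,
		       cmd_ok(proto, enable_verity_cmd) ? "sent" : "skipped");
	return 0;
}
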
@@ -1093,7 +1140,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
data_len = btrfs_dir_data_len(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
- if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) {
+ if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) {
if (name_len > XATTR_NAME_MAX) {
ret = -ENAMETOOLONG;
goto out;
@@ -1236,8 +1283,12 @@ struct backref_ctx {
/* may be truncated in case it's the last extent in a file */
u64 extent_len;
- /* Just to check for bugs in backref resolving */
- int found_itself;
+ /* The bytenr the file extent item we are processing refers to. */
+ u64 bytenr;
+ /* The owner (root id) of the data backref for the current extent. */
+ u64 backref_owner;
+ /* The offset of the data backref for the current extent. */
+ u64 backref_offset;
};
static int __clone_root_cmp_bsearch(const void *key, const void *elt)
@@ -1266,32 +1317,33 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
/*
* Called for every backref that is found for the current extent.
- * Results are collected in sctx->clone_roots->ino/offset/found_refs
+ * Results are collected in sctx->clone_roots->ino/offset.
*/
-static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
+static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id,
+ void *ctx_)
{
struct backref_ctx *bctx = ctx_;
- struct clone_root *found;
+ struct clone_root *clone_root;
/* First check if the root is in the list of accepted clone sources */
- found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
- bctx->sctx->clone_roots_cnt,
- sizeof(struct clone_root),
- __clone_root_cmp_bsearch);
- if (!found)
+ clone_root = bsearch((void *)(uintptr_t)root_id, bctx->sctx->clone_roots,
+ bctx->sctx->clone_roots_cnt,
+ sizeof(struct clone_root),
+ __clone_root_cmp_bsearch);
+ if (!clone_root)
return 0;
- if (found->root == bctx->sctx->send_root &&
+ /* This is our own reference, bail out as we can't clone from it. */
+ if (clone_root->root == bctx->sctx->send_root &&
ino == bctx->cur_objectid &&
- offset == bctx->cur_offset) {
- bctx->found_itself = 1;
- }
+ offset == bctx->cur_offset)
+ return 0;
/*
* Make sure we don't consider clones from send_root that are
* behind the current inode/offset.
*/
- if (found->root == bctx->sctx->send_root) {
+ if (clone_root->root == bctx->sctx->send_root) {
/*
* If the source inode was not yet processed we can't issue a
* clone operation, as the source extent does not exist yet at
@@ -1312,21 +1364,217 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
}
bctx->found++;
- found->found_refs++;
- if (ino < found->ino) {
- found->ino = ino;
- found->offset = offset;
- } else if (found->ino == ino) {
+ clone_root->found_ref = true;
+
+ /*
+ * If the given backref refers to a file extent item with a larger
+ * number of bytes than what we found before, use the new one so that
+ * we clone more optimally and end up doing fewer writes and getting
+ * fewer exclusive, non-shared extents at the destination.
+ */
+ if (num_bytes > clone_root->num_bytes) {
+ clone_root->ino = ino;
+ clone_root->offset = offset;
+ clone_root->num_bytes = num_bytes;
+
+ /*
+ * Found a perfect candidate, so there's no need to continue
+ * backref walking.
+ */
+ if (num_bytes >= bctx->extent_len)
+ return BTRFS_ITERATE_EXTENT_INODES_STOP;
+ }
+
+ return 0;
+}
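
The heart of iterate_backrefs() is a bsearch over the sorted clone_roots array, with the root id cast into the key pointer (the same trick __clone_root_cmp_bsearch() relies on), followed by a greedy "keep the largest candidate, stop once it covers the whole extent" update. A self-contained userspace sketch of that pattern, with invented inode numbers and sizes:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct clone_root { uint64_t root_id, ino, offset, num_bytes; };

/* The key is a root id smuggled in as a pointer, as in __clone_root_cmp_bsearch(). */
static int cmp_root(const void *key, const void *elt)
{
	uint64_t root_id = (uint64_t)(uintptr_t)key;
	const struct clone_root *cr = elt;

	if (root_id < cr->root_id) return -1;
	if (root_id > cr->root_id) return 1;
	return 0;
}

/* Returns 1 when the caller could stop walking backrefs early. */
static int update_best(struct clone_root *roots, size_t nr, uint64_t root_id,
		       uint64_t ino, uint64_t offset, uint64_t num_bytes,
		       uint64_t extent_len)
{
	struct clone_root *cr = bsearch((void *)(uintptr_t)root_id, roots, nr,
					sizeof(*roots), cmp_root);
	if (!cr)
		return 0;	/* not an accepted clone source */
	if (num_bytes > cr->num_bytes) {
		cr->ino = ino;
		cr->offset = offset;
		cr->num_bytes = num_bytes;
		if (num_bytes >= extent_len)
			return 1;	/* covers the whole extent: perfect candidate */
	}
	return 0;
}

int main(void)
{
	/* Must be sorted by root_id for bsearch; ids and sizes are made up. */
	struct clone_root roots[] = { { .root_id = 5 }, { .root_id = 257 } };

	update_best(roots, 2, 257, 260, 0, 4096, 8192);
	update_best(roots, 2, 257, 261, 0, 8192, 8192);	/* larger, keeps this one */
	printf("best from root 257: ino=%llu len=%llu\n",
	       (unsigned long long)roots[1].ino,
	       (unsigned long long)roots[1].num_bytes);
	return 0;
}
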
+
+static void empty_backref_cache(struct send_ctx *sctx)
+{
+ struct backref_cache_entry *entry;
+ struct backref_cache_entry *tmp;
+
+ list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list)
+ kfree(entry);
+
+ INIT_LIST_HEAD(&sctx->backref_cache.lru_list);
+ mtree_destroy(&sctx->backref_cache.entries);
+ sctx->backref_cache.size = 0;
+}
+
+static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
+ const u64 **root_ids_ret, int *root_count_ret)
+{
+ struct backref_ctx *bctx = ctx;
+ struct send_ctx *sctx = bctx->sctx;
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ const u64 key = leaf_bytenr >> fs_info->sectorsize_bits;
+ struct backref_cache_entry *entry;
+
+ if (sctx->backref_cache.size == 0)
+ return false;
+
+ /*
+ * If relocation happened since we first filled the cache, then we must
+ * empty the cache and can not use it, because even though we operate on
+ * read-only roots, their leaves and nodes may have been reallocated and
+ * now be used for different nodes/leaves of the same tree or some other
+ * tree.
+ *
+ * We are called from iterate_extent_inodes() while either holding a
+ * transaction handle or holding fs_info->commit_root_sem, so no need
+ * to take any lock here.
+ */
+ if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) {
+ empty_backref_cache(sctx);
+ return false;
+ }
+
+ entry = mtree_load(&sctx->backref_cache.entries, key);
+ if (!entry)
+ return false;
+
+ *root_ids_ret = entry->root_ids;
+ *root_count_ret = entry->num_roots;
+ list_move_tail(&entry->list, &sctx->backref_cache.lru_list);
+
+ return true;
+}
+
+static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
+ void *ctx)
+{
+ struct backref_ctx *bctx = ctx;
+ struct send_ctx *sctx = bctx->sctx;
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ struct backref_cache_entry *new_entry;
+ struct ulist_iterator uiter;
+ struct ulist_node *node;
+ int ret;
+
+ /*
+ * We're called while holding a transaction handle or while holding
+ * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a
+ * NOFS allocation.
+ */
+ new_entry = kmalloc(sizeof(struct backref_cache_entry), GFP_NOFS);
+ /* No worries, cache is optional. */
+ if (!new_entry)
+ return;
+
+ new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits;
+ new_entry->num_roots = 0;
+ ULIST_ITER_INIT(&uiter);
+ while ((node = ulist_next(root_ids, &uiter)) != NULL) {
+ const u64 root_id = node->val;
+ struct clone_root *root;
+
+ root = bsearch((void *)(uintptr_t)root_id, sctx->clone_roots,
+ sctx->clone_roots_cnt, sizeof(struct clone_root),
+ __clone_root_cmp_bsearch);
+ if (!root)
+ continue;
+
+ /* Too many roots, just exit, no worries as caching is optional. */
+ if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) {
+ kfree(new_entry);
+ return;
+ }
+
+ new_entry->root_ids[new_entry->num_roots] = root_id;
+ new_entry->num_roots++;
+ }
+
+ /*
+ * We may have not added any roots to the new cache entry, which means
+ * none of the roots is part of the list of roots from which we are
+ * allowed to clone. Cache the new entry as it's still useful to avoid
+ * backref walking to determine which roots have a path to the leaf.
+ */
+
+ if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) {
+ struct backref_cache_entry *lru_entry;
+ struct backref_cache_entry *mt_entry;
+
+ lru_entry = list_first_entry(&sctx->backref_cache.lru_list,
+ struct backref_cache_entry, list);
+ mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key);
+ ASSERT(mt_entry == lru_entry);
+ list_del(&mt_entry->list);
+ kfree(mt_entry);
+ sctx->backref_cache.size--;
+ }
+
+ ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key,
+ new_entry, GFP_NOFS);
+ ASSERT(ret == 0 || ret == -ENOMEM);
+ if (ret) {
+ /* Caching is optional, no worries. */
+ kfree(new_entry);
+ return;
+ }
+
+ list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list);
+
+ /*
+ * We are called from iterate_extent_inodes() while either holding a
+ * transaction handle or holding fs_info->commit_root_sem, so no need
+ * to take any lock here.
+ */
+ if (sctx->backref_cache.size == 0)
+ sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans;
+
+ sctx->backref_cache.size++;
+}
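
Taken together, lookup_backref_cache() and store_backref_cache() implement a small fixed-capacity LRU keyed by leaf_bytenr >> sectorsize_bits: a hit moves the entry to the tail of the LRU list, and a store into a full cache evicts the head (least recently used) first. A compact userspace sketch of that policy, assuming 4K sectors, a capacity of 4 for brevity, and a recency counter standing in for the kernel's list_head/maple tree machinery:

#include <stdint.h>
#include <stdio.h>

#define CACHE_CAPACITY 4	/* the kernel code uses 128 */

struct entry {
	uint64_t key;		/* leaf bytenr >> sectorsize_bits */
	uint64_t stamp;		/* last-use counter, stands in for the LRU list */
	int used;
};

static struct entry cache[CACHE_CAPACITY];
static uint64_t clock_tick;

static struct entry *cache_lookup(uint64_t key)
{
	for (int i = 0; i < CACHE_CAPACITY; i++) {
		if (cache[i].used && cache[i].key == key) {
			cache[i].stamp = ++clock_tick;	/* counterpart of list_move_tail() */
			return &cache[i];
		}
	}
	return NULL;
}

static void cache_store(uint64_t key)
{
	struct entry *victim = &cache[0];

	for (int i = 0; i < CACHE_CAPACITY; i++) {
		if (!cache[i].used) {		/* free slot, no eviction needed */
			victim = &cache[i];
			break;
		}
		if (cache[i].stamp < victim->stamp)
			victim = &cache[i];	/* least recently used so far */
	}
	victim->key = key;
	victim->stamp = ++clock_tick;
	victim->used = 1;
}

int main(void)
{
	const unsigned sectorsize_bits = 12;	/* 4K sectors */

	/* Same key derivation as the kernel code: bytenr >> sectorsize_bits. */
	for (uint64_t bytenr = 0; bytenr < 6 * 4096; bytenr += 4096)
		cache_store(bytenr >> sectorsize_bits);

	/* Keys 0 and 1 were evicted to make room for keys 4 and 5. */
	printf("key 1: %s, key 5: %s\n",
	       cache_lookup(1) ? "hit" : "miss",
	       cache_lookup(5) ? "hit" : "miss");
	return 0;
}
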
+
+static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei,
+ const struct extent_buffer *leaf, void *ctx)
+{
+ const u64 refs = btrfs_extent_refs(leaf, ei);
+ const struct backref_ctx *bctx = ctx;
+ const struct send_ctx *sctx = bctx->sctx;
+
+ if (bytenr == bctx->bytenr) {
+ const u64 flags = btrfs_extent_flags(leaf, ei);
+
+ if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
+ return -EUCLEAN;
+
/*
- * same extent found more then once in the same file.
+ * If we have only one reference and only the send root as a
+ * clone source - meaning no clone roots were given in the
+ * struct btrfs_ioctl_send_args passed to the send ioctl - then
+ * it's our reference and there's no point in doing backref
+ * walking which is expensive, so exit early.
*/
- if (found->offset > offset + bctx->extent_len)
- found->offset = offset;
+ if (refs == 1 && sctx->clone_roots_cnt == 1)
+ return -ENOENT;
}
+ /*
+ * Backreference walking (iterate_extent_inodes() below) is currently
+ * too expensive when an extent has a large number of references, both
+ * in time spent and used memory. So for now just fallback to write
+ * operations instead of clone operations when an extent has more than
+ * a certain amount of references.
+ */
+ if (refs > SEND_MAX_EXTENT_REFS)
+ return -ENOENT;
+
return 0;
}
+static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx)
+{
+ const struct backref_ctx *bctx = ctx;
+
+ if (ino == bctx->cur_objectid &&
+ root == bctx->backref_owner &&
+ offset == bctx->backref_offset)
+ return true;
+
+ return false;
+}
+
/*
* Given an inode, offset and extent item, it finds a good clone for a clone
* instruction. Returns -ENOENT when none could be found. The function makes
@@ -1348,79 +1596,36 @@ static int find_extent_clone(struct send_ctx *sctx,
u64 logical;
u64 disk_byte;
u64 num_bytes;
- u64 extent_item_pos;
- u64 flags = 0;
struct btrfs_file_extent_item *fi;
struct extent_buffer *eb = path->nodes[0];
- struct backref_ctx backref_ctx = {0};
+ struct backref_ctx backref_ctx = { 0 };
+ struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 };
struct clone_root *cur_clone_root;
- struct btrfs_key found_key;
- struct btrfs_path *tmp_path;
- struct btrfs_extent_item *ei;
int compressed;
u32 i;
- tmp_path = alloc_path_for_send();
- if (!tmp_path)
- return -ENOMEM;
+ /*
+ * With fallocate we can get prealloc extents beyond the inode's i_size,
+ * so we don't do anything here because clone operations can not clone
+ * to a range beyond i_size without increasing the i_size of the
+ * destination inode.
+ */
+ if (data_offset >= ino_size)
+ return 0;
- /* We only use this path under the commit sem */
- tmp_path->need_commit_sem = 0;
+ fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(eb, fi);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ return -ENOENT;
- if (data_offset >= ino_size) {
- /*
- * There may be extents that lie behind the file's size.
- * I at least had this in combination with snapshotting while
- * writing large files.
- */
- ret = 0;
- goto out;
- }
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (disk_byte == 0)
+ return -ENOENT;
- fi = btrfs_item_ptr(eb, path->slots[0],
- struct btrfs_file_extent_item);
- extent_type = btrfs_file_extent_type(eb, fi);
- if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- ret = -ENOENT;
- goto out;
- }
compressed = btrfs_file_extent_compression(eb, fi);
-
num_bytes = btrfs_file_extent_num_bytes(eb, fi);
- disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
- if (disk_byte == 0) {
- ret = -ENOENT;
- goto out;
- }
logical = disk_byte + btrfs_file_extent_offset(eb, fi);
- down_read(&fs_info->commit_root_sem);
- ret = extent_from_logical(fs_info, disk_byte, tmp_path,
- &found_key, &flags);
- up_read(&fs_info->commit_root_sem);
-
- if (ret < 0)
- goto out;
- if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- ret = -EIO;
- goto out;
- }
-
- ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
- struct btrfs_extent_item);
- /*
- * Backreference walking (iterate_extent_inodes() below) is currently
- * too expensive when an extent has a large number of references, both
- * in time spent and used memory. So for now just fallback to write
- * operations instead of clone operations when an extent has more than
- * a certain amount of references.
- */
- if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) {
- ret = -ENOENT;
- goto out;
- }
- btrfs_release_path(tmp_path);
-
/*
* Setup the clone roots.
*/
@@ -1428,37 +1633,59 @@ static int find_extent_clone(struct send_ctx *sctx,
cur_clone_root = sctx->clone_roots + i;
cur_clone_root->ino = (u64)-1;
cur_clone_root->offset = 0;
- cur_clone_root->found_refs = 0;
+ cur_clone_root->num_bytes = 0;
+ cur_clone_root->found_ref = false;
}
backref_ctx.sctx = sctx;
- backref_ctx.found = 0;
backref_ctx.cur_objectid = ino;
backref_ctx.cur_offset = data_offset;
- backref_ctx.found_itself = 0;
- backref_ctx.extent_len = num_bytes;
+ backref_ctx.bytenr = disk_byte;
+ /*
+ * Use the header owner and not the send root's id, because in case of a
+ * snapshot we can have shared subtrees.
+ */
+ backref_ctx.backref_owner = btrfs_header_owner(eb);
+ backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, fi);
/*
* The last extent of a file may be too large due to page alignment.
* We need to adjust extent_len in this case so that the checks in
- * __iterate_backrefs work.
+ * iterate_backrefs() work.
*/
if (data_offset + num_bytes >= ino_size)
backref_ctx.extent_len = ino_size - data_offset;
+ else
+ backref_ctx.extent_len = num_bytes;
/*
* Now collect all backrefs.
*/
+ backref_walk_ctx.bytenr = disk_byte;
if (compressed == BTRFS_COMPRESS_NONE)
- extent_item_pos = logical - found_key.objectid;
- else
- extent_item_pos = 0;
- ret = iterate_extent_inodes(fs_info, found_key.objectid,
- extent_item_pos, 1, __iterate_backrefs,
- &backref_ctx, false);
+ backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, fi);
+ backref_walk_ctx.fs_info = fs_info;
+ backref_walk_ctx.cache_lookup = lookup_backref_cache;
+ backref_walk_ctx.cache_store = store_backref_cache;
+ backref_walk_ctx.indirect_ref_iterator = iterate_backrefs;
+ backref_walk_ctx.check_extent_item = check_extent_item;
+ backref_walk_ctx.user_ctx = &backref_ctx;
+
+ /*
+ * If we have a single clone root, then it's the send root and we can tell
+ * the backref walking code to skip our own backref and not resolve it,
+ * since we can not use it for cloning - the source and destination
+ * ranges can't overlap and in case the leaf is shared through a subtree
+ * due to snapshots, we can't use those other roots since they are not
+ * in the list of clone roots.
+ */
+ if (sctx->clone_roots_cnt == 1)
+ backref_walk_ctx.skip_data_ref = skip_self_data_ref;
+ ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs,
+ &backref_ctx);
if (ret < 0)
- goto out;
+ return ret;
down_read(&fs_info->commit_root_sem);
if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
@@ -1475,37 +1702,42 @@ static int find_extent_clone(struct send_ctx *sctx,
* was already reallocated after the relocation.
*/
up_read(&fs_info->commit_root_sem);
- ret = -ENOENT;
- goto out;
+ return -ENOENT;
}
up_read(&fs_info->commit_root_sem);
- if (!backref_ctx.found_itself) {
- /* found a bug in backref code? */
- ret = -EIO;
- btrfs_err(fs_info,
- "did not find backref in send_root. inode=%llu, offset=%llu, disk_byte=%llu found extent=%llu",
- ino, data_offset, disk_byte, found_key.objectid);
- goto out;
- }
-
btrfs_debug(fs_info,
"find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
data_offset, ino, num_bytes, logical);
- if (!backref_ctx.found)
+ if (!backref_ctx.found) {
btrfs_debug(fs_info, "no clones found");
+ return -ENOENT;
+ }
cur_clone_root = NULL;
for (i = 0; i < sctx->clone_roots_cnt; i++) {
- if (sctx->clone_roots[i].found_refs) {
- if (!cur_clone_root)
- cur_clone_root = sctx->clone_roots + i;
- else if (sctx->clone_roots[i].root == sctx->send_root)
- /* prefer clones from send_root over others */
- cur_clone_root = sctx->clone_roots + i;
- }
+ struct clone_root *clone_root = &sctx->clone_roots[i];
+
+ if (!clone_root->found_ref)
+ continue;
+
+ /*
+ * Choose the root from which we can clone more bytes, to
+ * minimize write operations and therefore have more extent
+ * sharing at the destination (the same as in the source).
+ */
+ if (!cur_clone_root ||
+ clone_root->num_bytes > cur_clone_root->num_bytes) {
+ cur_clone_root = clone_root;
+ /*
+ * We found an optimal clone candidate (any inode from
+ * any root is fine), so we're done.
+ */
+ if (clone_root->num_bytes >= backref_ctx.extent_len)
+ break;
+ }
}
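
The selection loop above prefers the clone root offering the most bytes because anything the clone does not cover has to be sent as plain writes. A tiny worked example of that trade-off, with invented lengths:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t extent_len = 65536;		/* bytes needed at the destination */
	const uint64_t candidates[] = { 16384, 65536 };	/* bytes cloneable from two roots */

	for (int i = 0; i < 2; i++) {
		uint64_t cloned = candidates[i] < extent_len ? candidates[i] : extent_len;

		printf("candidate %d: clone %llu bytes, write %llu bytes\n", i,
		       (unsigned long long)cloned,
		       (unsigned long long)(extent_len - cloned));
	}
	return 0;
}
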
if (cur_clone_root) {
@@ -1515,8 +1747,6 @@ static int find_extent_clone(struct send_ctx *sctx,
ret = -ENOENT;
}
-out:
- btrfs_free_path(tmp_path);
return ret;
}
@@ -1596,13 +1826,17 @@ static int gen_unique_name(struct send_ctx *sctx,
return -ENOMEM;
while (1) {
+ struct fscrypt_str tmp_name;
+
len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
ino, gen, idx);
ASSERT(len < sizeof(tmp));
+ tmp_name.name = tmp;
+ tmp_name.len = strlen(tmp);
di = btrfs_lookup_dir_item(NULL, sctx->send_root,
path, BTRFS_FIRST_FREE_OBJECTID,
- tmp, strlen(tmp), 0);
+ &tmp_name, 0);
btrfs_release_path(path);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
@@ -1622,7 +1856,7 @@ static int gen_unique_name(struct send_ctx *sctx,
di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
path, BTRFS_FIRST_FREE_OBJECTID,
- tmp, strlen(tmp), 0);
+ &tmp_name, 0);
btrfs_release_path(path);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
@@ -1752,13 +1986,13 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
struct btrfs_dir_item *di;
struct btrfs_key key;
struct btrfs_path *path;
+ struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len);
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
- di = btrfs_lookup_dir_item(NULL, root, path,
- dir, name, name_len, 0);
+ di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0);
if (IS_ERR_OR_NULL(di)) {
ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
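
The (name, name_len) pairs are now carried in a struct fscrypt_str, built either field by field (as gen_unique_name() does above) or via the FSTR_INIT() initializer used here. A userspace approximation of the idiom; the struct and macro below are simplified stand-ins, the real definitions live in linux/fscrypt.h:

#include <stdio.h>
#include <string.h>

/* Simplified userspace stand-ins for the kernel definitions. */
struct fscrypt_str {
	const char *name;
	unsigned int len;
};
#define FSTR_INIT(n, l)	{ .name = (n), .len = (l) }

static void lookup(const struct fscrypt_str *name)
{
	printf("looking up \"%.*s\" (%u bytes)\n", (int)name->len, name->name, name->len);
}

int main(void)
{
	char tmp[64];
	struct fscrypt_str tmp_name;
	struct fscrypt_str name_str = FSTR_INIT("foo", 3);

	/* Built field by field, like gen_unique_name() does for its orphan names. */
	snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", 257ULL, 1ULL, 0ULL);
	tmp_name.name = tmp;
	tmp_name.len = strlen(tmp);

	lookup(&tmp_name);
	lookup(&name_str);
	return 0;
}
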
@@ -5702,6 +5936,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
u64 ext_len;
u64 clone_len;
u64 clone_data_offset;
+ bool crossed_src_i_size = false;
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(clone_root->root, path);
@@ -5759,8 +5994,10 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
if (key.offset >= clone_src_i_size)
break;
- if (key.offset + ext_len > clone_src_i_size)
+ if (key.offset + ext_len > clone_src_i_size) {
ext_len = clone_src_i_size - key.offset;
+ crossed_src_i_size = true;
+ }
clone_data_offset = btrfs_file_extent_offset(leaf, ei);
if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
@@ -5821,6 +6058,25 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
ret = send_clone(sctx, offset, clone_len,
clone_root);
}
+ } else if (crossed_src_i_size && clone_len < len) {
+ /*
+ * If we are at i_size of the clone source inode and we
+ * can not clone from it, terminate the loop. This is
+ * to avoid sending two write operations, one with a
+ * length matching clone_len and the final one after
+ * this loop with a length of len - clone_len.
+ *
+ * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
+ * was passed to the send ioctl), this helps avoid
+ * sending an encoded write for an offset that is not
+ * sector size aligned, in case the i_size of the source
+ * inode is not sector size aligned. That will make the
+ * receiver fallback to decompression of the data and
+ * writing it using regular buffered IO, therefore while
+ * not incorrect, it's not optimal due to decompression and
+ * possible re-compression at the receiver.
+ */
+ break;
} else {
ret = send_extent_data(sctx, dst_path, offset,
clone_len);
@@ -6470,7 +6726,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
if (ret < 0)
goto out;
}
- if (sctx->cur_inode_needs_verity) {
+
+ if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY)
+ && sctx->cur_inode_needs_verity) {
ret = process_verity(sctx);
if (ret < 0)
goto out;
@@ -6666,17 +6924,19 @@ static int changed_inode(struct send_ctx *sctx,
/*
* First, process the inode as if it was deleted.
*/
- sctx->cur_inode_gen = right_gen;
- sctx->cur_inode_new = false;
- sctx->cur_inode_deleted = true;
- sctx->cur_inode_size = btrfs_inode_size(
- sctx->right_path->nodes[0], right_ii);
- sctx->cur_inode_mode = btrfs_inode_mode(
- sctx->right_path->nodes[0], right_ii);
- ret = process_all_refs(sctx,
- BTRFS_COMPARE_TREE_DELETED);
- if (ret < 0)
- goto out;
+ if (old_nlinks > 0) {
+ sctx->cur_inode_gen = right_gen;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_deleted = true;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->right_path->nodes[0], right_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->right_path->nodes[0], right_ii);
+ ret = process_all_refs(sctx,
+ BTRFS_COMPARE_TREE_DELETED);
+ if (ret < 0)
+ goto out;
+ }
/*
* Now process the inode as if it was new.
@@ -7837,6 +8097,9 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
INIT_LIST_HEAD(&sctx->name_cache_list);
+ INIT_LIST_HEAD(&sctx->backref_cache.lru_list);
+ mt_init(&sctx->backref_cache.entries);
+
sctx->flags = arg->flags;
if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
@@ -7875,7 +8138,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
if (sctx->proto >= 2) {
u32 send_buf_num_pages;
- sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE);
+ sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2;
sctx->send_buf = vmalloc(sctx->send_max_size);
if (!sctx->send_buf) {
ret = -ENOMEM;
@@ -8099,6 +8362,8 @@ out:
close_current_inode(sctx);
+ empty_backref_cache(sctx);
+
kfree(sctx);
}