1 files changed, 187 insertions, 87 deletions
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 829adbb50837..d1973141d3bb 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -1,7 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <linux/blkdev.h>
 #include <linux/iversion.h>
+#include "compression.h"
 #include "ctree.h"
+#include "delalloc-space.h"
 #include "reflink.h"
 #include "transaction.h"
 
@@ -42,49 +45,131 @@ out:
 	return ret;
 }
 
+static int copy_inline_to_page(struct inode *inode,
+			       const u64 file_offset,
+			       char *inline_data,
+			       const u64 size,
+			       const u64 datal,
+			       const u8 comp_type)
+{
+	const u64 block_size = btrfs_inode_sectorsize(inode);
+	const u64 range_end = file_offset + block_size - 1;
+	const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
+	char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
+	struct extent_changeset *data_reserved = NULL;
+	struct page *page = NULL;
+	int ret;
+
+	ASSERT(IS_ALIGNED(file_offset, block_size));
+
+	/*
+	 * We have flushed and locked the ranges of the source and destination
+	 * inodes, we also have locked the inodes, so we are safe to do a
+	 * reservation here. Also we must not do the reservation while holding
+	 * a transaction open, otherwise we would deadlock.
+	 */
+	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
+					   block_size);
+	if (ret)
+		goto out;
+
+	page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT,
+				   btrfs_alloc_write_mask(inode->i_mapping));
+	if (!page) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	set_page_extent_mapped(page);
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
+			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+			 0, 0, NULL);
+	ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
+	if (ret)
+		goto out_unlock;
+
+	if (comp_type == BTRFS_COMPRESS_NONE) {
+		char *map;
+
+		map = kmap(page);
+		memcpy(map, data_start, datal);
+		flush_dcache_page(page);
+		kunmap(page);
+	} else {
+		ret = btrfs_decompress(comp_type, data_start, page, 0,
+				       inline_size, datal);
+		if (ret)
+			goto out_unlock;
+		flush_dcache_page(page);
+	}
+
+	/*
+	 * If our inline data is smaller then the block/page size, then the
+	 * remaining of the block/page is equivalent to zeroes. We had something
+	 * like the following done:
+	 *
+	 * $ xfs_io -f -c "pwrite -S 0xab 0 500" file
+	 * $ sync  # (or fsync)
+	 * $ xfs_io -c "falloc 0 4K" file
+	 * $ xfs_io -c "pwrite -S 0xcd 4K 4K"
+	 *
+	 * So what's in the range [500, 4095] corresponds to zeroes.
+	 */
+	if (datal < block_size) {
+		char *map;
+
+		map = kmap(page);
+		memset(map + datal, 0, block_size - datal);
+		flush_dcache_page(page);
+		kunmap(page);
+	}
+
+	SetPageUptodate(page);
+	ClearPageChecked(page);
+	set_page_dirty(page);
+out_unlock:
+	if (page) {
+		unlock_page(page);
+		put_page(page);
+	}
+	if (ret)
+		btrfs_delalloc_release_space(inode, data_reserved, file_offset,
+					     block_size, true);
+	btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
+out:
+	extent_changeset_free(data_reserved);
+
+	return ret;
+}
+
 /*
- * Make sure we do not end up inserting an inline extent into a file that has
- * already other (non-inline) extents. If a file has an inline extent it can
- * not have any other extents and the (single) inline extent must start at the
- * file offset 0. Failing to respect these rules will lead to file corruption,
- * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
- *
- * We can have extents that have been already written to disk or we can have
- * dirty ranges still in delalloc, in which case the extent maps and items are
- * created only when we run delalloc, and the delalloc ranges might fall outside
- * the range we are currently locking in the inode's io tree. So we check the
- * inode's i_size because of that (i_size updates are done while holding the
- * i_mutex, which we are holding here).
- * We also check to see if the inode has a size not greater than "datal" but has
- * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
- * protected against such concurrent fallocate calls by the i_mutex).
- *
- * If the file has no extents but a size greater than datal, do not allow the
- * copy because we would need turn the inline extent into a non-inline one (even
- * with NO_HOLES enabled). If we find our destination inode only has one inline
- * extent, just overwrite it with the source inline extent if its size is less
- * than the source extent's size, or we could copy the source inline extent's
- * data into the destination inode's inline extent if the later is greater then
- * the former.
+ * Deal with cloning of inline extents. We try to copy the inline extent from
+ * the source inode to destination inode when possible. When not possible we
+ * copy the inline extent's data into the respective page of the inode.
  */
 static int clone_copy_inline_extent(struct inode *dst,
-				    struct btrfs_trans_handle *trans,
 				    struct btrfs_path *path,
 				    struct btrfs_key *new_key,
 				    const u64 drop_start,
 				    const u64 datal,
 				    const u64 size,
-				    const char *inline_data)
+				    const u8 comp_type,
+				    char *inline_data,
+				    struct btrfs_trans_handle **trans_out)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
 	struct btrfs_root *root = BTRFS_I(dst)->root;
 	const u64 aligned_end = ALIGN(new_key->offset + datal,
 				      fs_info->sectorsize);
+	struct btrfs_trans_handle *trans = NULL;
 	int ret;
 	struct btrfs_key key;
 
-	if (new_key->offset > 0)
-		return -EOPNOTSUPP;
+	if (new_key->offset > 0) {
+		ret = copy_inline_to_page(dst, new_key->offset, inline_data,
+					  size, datal, comp_type);
+		goto out;
+	}
 
 	key.objectid = btrfs_ino(BTRFS_I(dst));
 	key.type = BTRFS_EXTENT_DATA_KEY;
@@ -103,72 +188,75 @@ static int clone_copy_inline_extent(struct inode *dst,
 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 		if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
 		    key.type == BTRFS_EXTENT_DATA_KEY) {
+			/*
+			 * There's an implicit hole at file offset 0, copy the
+			 * inline extent's data to the page.
+			 */
 			ASSERT(key.offset > 0);
-			return -EOPNOTSUPP;
+			ret = copy_inline_to_page(dst, new_key->offset,
+						  inline_data, size, datal,
+						  comp_type);
+			goto out;
 		}
 	} else if (i_size_read(dst) <= datal) {
 		struct btrfs_file_extent_item *ei;
-		u64 ext_len;
 
-		/*
-		 * If the file size is <= datal, make sure there are no other
-		 * extents following (can happen do to an fallocate call with
-		 * the flag FALLOC_FL_KEEP_SIZE).
-		 */
 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				    struct btrfs_file_extent_item);
 		/*
-		 * If it's an inline extent, it can not have other extents
-		 * following it.
+		 * If it's an inline extent replace it with the source inline
+		 * extent, otherwise copy the source inline extent data into
+		 * the respective page at the destination inode.
 		 */
 		if (btrfs_file_extent_type(path->nodes[0], ei) ==
 		    BTRFS_FILE_EXTENT_INLINE)
 			goto copy_inline_extent;
 
-		ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
-		if (ext_len > aligned_end)
-			return -EOPNOTSUPP;
-
-		ret = btrfs_next_item(root, path);
-		if (ret < 0) {
-			return ret;
-		} else if (ret == 0) {
-			btrfs_item_key_to_cpu(path->nodes[0], &key,
-					      path->slots[0]);
-			if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
-			    key.type == BTRFS_EXTENT_DATA_KEY)
-				return -EOPNOTSUPP;
-		}
+		ret = copy_inline_to_page(dst, new_key->offset, inline_data,
+					  size, datal, comp_type);
+		goto out;
 	}
 
 copy_inline_extent:
+	ret = 0;
 	/*
 	 * We have no extent items, or we have an extent at offset 0 which may
 	 * or may not be inlined. All these cases are dealt the same way.
 	 */
 	if (i_size_read(dst) > datal) {
 		/*
-		 * If the destination inode has an inline extent.
-		 * This would require copying the data from the source inline
-		 * extent into the beginning of the destination's inline extent.
-		 * But this is really complex, both extents can be compressed
-		 * or just one of them, which would require decompressing and
-		 * re-compressing data (which could increase the new compressed
-		 * size, not allowing the compressed data to fit anymore in an
-		 * inline extent).
-		 * So just don't support this case for now (it should be rare,
-		 * we are not really saving space when cloning inline extents).
+		 * At the destination offset 0 we have either a hole, a regular
+		 * extent or an inline extent larger then the one we want to
+		 * clone. Deal with all these cases by copying the inline extent
+		 * data into the respective page at the destination inode.
 		 */
-		return -EOPNOTSUPP;
+		ret = copy_inline_to_page(dst, new_key->offset, inline_data,
+					   size, datal, comp_type);
+		goto out;
 	}
 
 	btrfs_release_path(path);
+	/*
+	 * If we end up here it means were copy the inline extent into a leaf
+	 * of the destination inode. We know we will drop or adjust at most one
+	 * extent item in the destination root.
+	 *
+	 * 1 unit - adjusting old extent (we may have to split it)
+	 * 1 unit - add new extent
+	 * 1 unit - inode update
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		trans = NULL;
+		goto out;
+	}
 	ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
 	if (ret)
-		return ret;
+		goto out;
 	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
 	if (ret)
-		return ret;
+		goto out;
 
 	write_extent_buffer(path->nodes[0], inline_data,
 			    btrfs_item_ptr_offset(path->nodes[0],
@@ -176,8 +264,28 @@ copy_inline_extent:
 			    size);
 	inode_add_bytes(dst, datal);
 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
+out:
+	if (!ret && !trans) {
+		/*
+		 * No transaction here means we copied the inline extent into a
+		 * page of the destination inode.
+		 *
+		 * 1 unit to update inode item
+		 */
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			trans = NULL;
+		}
+	}
+	if (ret && trans) {
+		btrfs_abort_transaction(trans, ret);
+		btrfs_end_transaction(trans);
+	}
+	if (!ret)
+		*trans_out = trans;
 
-	return 0;
+	return ret;
 }
 
 /**
@@ -196,7 +304,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 		       const u64 destoff, int no_time_update)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_path *path = NULL;
 	struct extent_buffer *leaf;
 	struct btrfs_trans_handle *trans;
@@ -233,6 +340,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 		struct btrfs_key new_key;
 		u64 disko = 0, diskl = 0;
 		u64 datao = 0, datal = 0;
+		u8 comp;
 		u64 drop_start;
 
 		/* Note the key will change type as we walk through the tree */
@@ -275,6 +383,7 @@ process_slot:
 
 		extent = btrfs_item_ptr(leaf, slot,
 					struct btrfs_file_extent_item);
+		comp = btrfs_file_extent_compression(leaf, extent);
 		type = btrfs_file_extent_type(leaf, extent);
 		if (type == BTRFS_FILE_EXTENT_REG ||
 		    type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -369,29 +478,11 @@ process_slot:
 			if (key.offset != 0 || datal > fs_info->sectorsize)
 				return -EUCLEAN;
 
-			/*
-			 * If our extent is inline, we know we will drop or
-			 * adjust at most 1 extent item in the destination root.
-			 *
-			 * 1 - adjusting old extent (we may have to split it)
-			 * 1 - add new extent
-			 * 1 - inode update
-			 */
-			trans = btrfs_start_transaction(root, 3);
-			if (IS_ERR(trans)) {
-				ret = PTR_ERR(trans);
-				goto out;
-			}
-
-			ret = clone_copy_inline_extent(inode, trans, path,
-						       &new_key, drop_start,
-						       datal, size, buf);
-			if (ret) {
-				if (ret != -EOPNOTSUPP)
-					btrfs_abort_transaction(trans, ret);
-				btrfs_end_transaction(trans);
+			ret = clone_copy_inline_extent(inode, path, &new_key,
+						       drop_start, datal, size,
+						       comp, buf, &trans);
+			if (ret)
 				goto out;
-			}
 		}
 
 		btrfs_release_path(path);
@@ -526,6 +617,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 	struct inode *src = file_inode(file_src);
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	int ret;
+	int wb_ret;
 	u64 len = olen;
 	u64 bs = fs_info->sb->s_blocksize;
 
@@ -566,6 +658,14 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 	btrfs_double_extent_lock(src, off, inode, destoff, len);
 	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
 	btrfs_double_extent_unlock(src, off, inode, destoff, len);
+
+	/*
+	 * We may have copied an inline extent into a page of the destination
+	 * range, so wait for writeback to complete before truncating pages
+	 * from the page cache. This is a rare case.
+	 */
+	wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
+	ret = ret ? ret : wb_ret;
 	/*
 	 * Truncate page cache pages so that future reads will see the cloned
 	 * data immediately and not the previous data.