summaryrefslogtreecommitdiff
path: root/fs/btrfs/disk-io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r--fs/btrfs/disk-io.c181
1 files changed, 146 insertions, 35 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 41b718cfea40..c9a3036c23bf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -42,6 +42,7 @@
#include "discard.h"
#include "space-info.h"
#include "zoned.h"
+#include "subpage.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@@ -440,6 +441,74 @@ static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
return ret;
}
+static int csum_one_extent_buffer(struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ u8 result[BTRFS_CSUM_SIZE];
+ int ret;
+
+ ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
+ offsetof(struct btrfs_header, fsid),
+ BTRFS_FSID_SIZE) == 0);
+ csum_tree_block(eb, result);
+
+ if (btrfs_header_level(eb))
+ ret = btrfs_check_node(eb);
+ else
+ ret = btrfs_check_leaf_full(eb);
+
+ if (ret < 0) {
+ btrfs_print_tree(eb, 0);
+ btrfs_err(fs_info,
+ "block=%llu write time tree block corruption detected",
+ eb->start);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ return ret;
+ }
+ write_extent_buffer(eb, result, 0, fs_info->csum_size);
+
+ return 0;
+}
+
+/* Checksum all dirty extent buffers in one bio_vec */
+static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
+ struct bio_vec *bvec)
+{
+ struct page *page = bvec->bv_page;
+ u64 bvec_start = page_offset(page) + bvec->bv_offset;
+ u64 cur;
+ int ret = 0;
+
+ for (cur = bvec_start; cur < bvec_start + bvec->bv_len;
+ cur += fs_info->nodesize) {
+ struct extent_buffer *eb;
+ bool uptodate;
+
+ eb = find_extent_buffer(fs_info, cur);
+ uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
+ fs_info->nodesize);
+
+ /* A dirty eb shouldn't disappear from buffer_radix */
+ if (WARN_ON(!eb))
+ return -EUCLEAN;
+
+ if (WARN_ON(cur != btrfs_header_bytenr(eb))) {
+ free_extent_buffer(eb);
+ return -EUCLEAN;
+ }
+ if (WARN_ON(!uptodate)) {
+ free_extent_buffer(eb);
+ return -EUCLEAN;
+ }
+
+ ret = csum_one_extent_buffer(eb);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ return ret;
+ }
+ return ret;
+}
+
/*
* Checksum a dirty tree block before IO. This has extra checks to make sure
* we only fill in the checksum field in the first page of a multi-page block.
@@ -450,9 +519,10 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
struct page *page = bvec->bv_page;
u64 start = page_offset(page);
u64 found_start;
- u8 result[BTRFS_CSUM_SIZE];
struct extent_buffer *eb;
- int ret;
+
+ if (fs_info->sectorsize < PAGE_SIZE)
+ return csum_dirty_subpage_buffers(fs_info, bvec);
eb = (struct extent_buffer *)page->private;
if (page != eb->pages[0])
@@ -474,28 +544,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
if (WARN_ON(!PageUptodate(page)))
return -EUCLEAN;
- ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
- offsetof(struct btrfs_header, fsid),
- BTRFS_FSID_SIZE) == 0);
-
- csum_tree_block(eb, result);
-
- if (btrfs_header_level(eb))
- ret = btrfs_check_node(eb);
- else
- ret = btrfs_check_leaf_full(eb);
-
- if (ret < 0) {
- btrfs_print_tree(eb, 0);
- btrfs_err(fs_info,
- "block=%llu write time tree block corruption detected",
- eb->start);
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
- return ret;
- }
- write_extent_buffer(eb, result, 0, fs_info->csum_size);
-
- return 0;
+ return csum_one_extent_buffer(eb);
}
static int check_tree_block_fsid(struct extent_buffer *eb)
@@ -992,14 +1041,48 @@ static void btree_invalidatepage(struct page *page, unsigned int offset,
static int btree_set_page_dirty(struct page *page)
{
#ifdef DEBUG
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_subpage *subpage;
struct extent_buffer *eb;
+ int cur_bit = 0;
+ u64 page_start = page_offset(page);
+
+ if (fs_info->sectorsize == PAGE_SIZE) {
+ BUG_ON(!PagePrivate(page));
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
+ BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(!atomic_read(&eb->refs));
+ btrfs_assert_tree_locked(eb);
+ return __set_page_dirty_nobuffers(page);
+ }
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ ASSERT(subpage->dirty_bitmap);
+ while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
+ unsigned long flags;
+ u64 cur;
+ u16 tmp = (1 << cur_bit);
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ if (!(tmp & subpage->dirty_bitmap)) {
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ cur_bit++;
+ continue;
+ }
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ cur = page_start + cur_bit * fs_info->sectorsize;
- BUG_ON(!PagePrivate(page));
- eb = (struct extent_buffer *)page->private;
- BUG_ON(!eb);
- BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
- BUG_ON(!atomic_read(&eb->refs));
- btrfs_assert_tree_locked(eb);
+ eb = find_extent_buffer(fs_info, cur);
+ ASSERT(eb);
+ ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ ASSERT(atomic_read(&eb->refs));
+ btrfs_assert_tree_locked(eb);
+ free_extent_buffer(eb);
+
+ cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
+ }
#endif
return __set_page_dirty_nobuffers(page);
}
@@ -1807,14 +1890,21 @@ static int cleaner_kthread(void *arg)
btrfs_run_defrag_inodes(fs_info);
/*
- * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
+ * Acquires fs_info->reclaim_bgs_lock to avoid racing
* with relocation (btrfs_relocate_chunk) and relocation
* acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
- * after acquiring fs_info->delete_unused_bgs_mutex. So we
+ * after acquiring fs_info->reclaim_bgs_lock. So we
* can't hold, nor need to, fs_info->cleaner_mutex when deleting
* unused block groups.
*/
btrfs_delete_unused_bgs(fs_info);
+
+ /*
+ * Reclaim block groups in the reclaim_bgs list after we deleted
+ * all unused block_groups. This possibly gives us some more free
+ * space.
+ */
+ btrfs_reclaim_bgs(fs_info);
sleep:
clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
if (kthread_should_park())
@@ -2387,8 +2477,9 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->dev_root = root;
- btrfs_init_devices_late(fs_info);
}
+ /* Initialize fs_info for all devices in any case */
+ btrfs_init_devices_late(fs_info);
/* If IGNOREDATACSUMS is set don't bother reading the csum root. */
if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
@@ -2792,7 +2883,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->treelog_bg_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
- mutex_init(&fs_info->delete_unused_bgs_mutex);
+ mutex_init(&fs_info->reclaim_bgs_lock);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
mutex_init(&fs_info->zoned_meta_io_lock);
@@ -2802,6 +2893,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
+ INIT_LIST_HEAD(&fs_info->reclaim_bgs);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&fs_info->allocated_roots);
INIT_LIST_HEAD(&fs_info->allocated_ebs);
@@ -2890,6 +2982,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->swapfile_pins = RB_ROOT;
fs_info->send_in_progress = 0;
+
+ fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
+ INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
}
static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
@@ -3009,6 +3104,21 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
}
}
+ /*
+ * btrfs_find_orphan_roots() is responsible for finding all the dead
+ * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
+ * them into the fs_info->fs_roots_radix tree. This must be done before
+ * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
+ * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
+ * item before the root's tree is deleted - this means that if we unmount
+ * or crash before the deletion completes, on the next mount we will not
+ * delete what remains of the tree because the orphan item does not
+ * exist anymore, which is what tells us we have a pending deletion.
+ */
+ ret = btrfs_find_orphan_roots(fs_info);
+ if (ret)
+ goto out;
+
ret = btrfs_cleanup_fs_roots(fs_info);
if (ret)
goto out;
@@ -3068,7 +3178,6 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
}
}
- ret = btrfs_find_orphan_roots(fs_info);
out:
return ret;
}
@@ -4234,6 +4343,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
+ cancel_work_sync(&fs_info->reclaim_bgs_work);
+
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);