author		Linus Torvalds <torvalds@linux-foundation.org>	2024-01-10 20:27:40 +0300
committer	Linus Torvalds <torvalds@linux-foundation.org>	2024-01-10 20:27:40 +0300
commit		affc5af36bbb62073b6aaa4f4459b38937ff5331 (patch)
tree		2267430b2cc94ce7c8e87ac362bcc28e89975db1 /fs/btrfs/inode.c
parent		12958e9c4c8e93ef694c10960c78453edf21526e (diff)
parent		e94dfb7a2935cb91faca88bf7136177d1ce0dda8 (diff)
download	linux-affc5af36bbb62073b6aaa4f4459b38937ff5331.tar.xz
Merge tag 'for-6.8-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
"There are no exciting changes for users, it's been mostly API
conversions and some fixes or refactoring.
The mount API conversion is a base for future improvements that would
come with the VFS. Metadata processing has been converted to folios;
large folios are not yet enabled, but that is one patch away once
everything gets tested enough.
Core changes:
- convert extent buffers to folios:
- direct API conversion where possible
- performance can drop by a few percent on metadata-heavy
workloads; the folio sizes are not constant and the calculations
add up in the item helpers
- both regular and subpage modes
- data cannot be converted yet, we need to port that to iomap and
there are some other generic changes required
- convert mount to the new API, should not be user visible:
- options deprecated a long time ago have been removed: inode_cache,
recovery
- the new logic that splits mount into two phases slightly changes
the timing of device scanning for multi-device filesystems
- LSM options will now work (like for SELinux)
- convert delayed nodes radix tree to xarray, preserving the
preload-like logic that still allows allocation with GFP_NOFS
- more validation of the sysfs value of scrub_speed_max
- refactor chunk map structure, reduce size and improve performance
- extent map refactoring, smaller data structures, improved
performance
- reduce size of struct extent_io_tree, embedded in several
structures
- temporary pages used for compression are cached and attached to a
shrinker; this may slightly improve performance
- in zoned mode, remove redirty extent buffer tracking; zeros are
written in case an out-of-order write is detected and proper data are
written to the actual write pointer
- cleanups, refactoring, error message improvements, updated tests"
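The extent buffer folio conversion above follows one recurring pattern, visible throughout the diff further down (for example in wait_subpage_spinlock()): take the folio backing a page once with page_folio(), then replace the PagePrivate()/page->private idiom with the folio private helpers. A minimal sketch of that pattern — the function name is hypothetical, not from this series:

	static void subpage_lookup_sketch(struct page *page)
	{
		/* One folio backs this page; subpage state now lives on the folio. */
		struct folio *folio = page_folio(page);
		struct btrfs_subpage *subpage;

		/* Replaces ASSERT(PagePrivate(page) && page->private). */
		ASSERT(folio_test_private(folio) && folio_get_private(folio));

		/* Replaces the (struct btrfs_subpage *)page->private cast. */
		subpage = folio_get_private(folio);

		/* ... operate on the subpage bitmaps and locks as before ... */
	}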
* tag 'for-6.8-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (89 commits)
btrfs: pass btrfs_io_geometry into btrfs_max_io_len
btrfs: pass struct btrfs_io_geometry to set_io_stripe
btrfs: open code set_io_stripe for RAID56
btrfs: change block mapping to switch/case in btrfs_map_block
btrfs: factor out block mapping for single profiles
btrfs: factor out block mapping for RAID5/6
btrfs: reduce scope of data_stripes in btrfs_map_block
btrfs: factor out block mapping for RAID10
btrfs: factor out block mapping for DUP profiles
btrfs: factor out RAID1 block mapping
btrfs: factor out block-mapping for RAID0
btrfs: re-introduce struct btrfs_io_geometry
btrfs: factor out helper for single device IO check
btrfs: migrate btrfs_repair_io_failure() to folio interfaces
btrfs: migrate eb_bitmap_offset() to folio interfaces
btrfs: migrate various end io functions to folios
btrfs: migrate subpage code to folio interfaces
btrfs: migrate get_eb_page_index() and get_eb_offset_in_page() to folios
btrfs: don't double put our subpage reference in alloc_extent_buffer
btrfs: cleanup metadata page pointer usage
...
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 153 |
1 file changed, 98 insertions(+), 55 deletions(-)
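The first hunk below adds a dedicated lockdep class for the per-inode file extent io_tree, which is now also allocated separately and only when the NO_HOLES feature is absent. The lockdep pattern itself is small and generic; a minimal sketch with hypothetical names (the real key in the diff is file_extent_tree_class):

	/* A lock_class_key must have static storage duration. */
	static struct lock_class_key my_tree_class;

	static void init_my_tree(struct extent_io_tree *tree)
	{
		/*
		 * Give this tree's lock its own lockdep class so its
		 * inode-lock-then-tree-lock ordering is not conflated with
		 * the io_tree's tree-lock-then-inode-lock ordering.
		 */
		lockdep_set_class(&tree->lock, &my_tree_class);
	}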
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fea464b2a54e..809b11472a80 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -114,6 +114,15 @@ struct data_reloc_warn {
 	int mirror_num;
 };
 
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we lookup and
+ * update the disk_i_size, but lockdep will complain because our io_tree we hold
+ * the tree lock and get the inode lock when setting delalloc.  These two things
+ * are unrelated, so make a class for the file_extent_tree so we don't get the
+ * two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
 static const struct inode_operations btrfs_dir_inode_operations;
 static const struct inode_operations btrfs_symlink_inode_operations;
 static const struct inode_operations btrfs_special_inode_operations;
@@ -447,8 +456,8 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
 		 * range, then btrfs_mark_ordered_io_finished() will handle
		 * the ordered extent accounting for the range.
		 */
-		btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
-					       offset, bytes);
+		btrfs_folio_clamp_clear_ordered(inode->root->fs_info,
+						page_folio(page), offset, bytes);
		put_page(page);
	}
 
@@ -1037,7 +1046,7 @@ free_pages:
	if (pages) {
		for (i = 0; i < nr_pages; i++) {
			WARN_ON(pages[i]->mapping);
-			put_page(pages[i]);
+			btrfs_free_compr_page(pages[i]);
		}
		kfree(pages);
	}
@@ -1052,7 +1061,7 @@ static void free_async_extent_pages(struct async_extent *async_extent)
 
	for (i = 0; i < async_extent->nr_pages; i++) {
		WARN_ON(async_extent->pages[i]->mapping);
-		put_page(async_extent->pages[i]);
+		btrfs_free_compr_page(async_extent->pages[i]);
	}
	kfree(async_extent->pages);
	async_extent->nr_pages = 0;
@@ -2793,7 +2802,7 @@ out_page:
					    PAGE_SIZE, !ret);
		clear_page_dirty_for_io(page);
	}
-	btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
+	btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE);
	unlock_page(page);
	put_page(page);
	kfree(fixup);
@@ -2848,7 +2857,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
	 * page->mapping outside of the page lock.
	 */
	ihold(inode);
-	btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
+	btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE);
	get_page(page);
	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
	fixup->page = page;
@@ -3118,7 +3127,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
					ordered_extent->disk_num_bytes);
		}
	}
-	unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
+	unpin_extent_cache(inode, ordered_extent->file_offset,
			   ordered_extent->num_bytes, trans->transid);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
@@ -3796,7 +3805,7 @@ cache_index:
	 * cache.
	 *
	 * This is required for both inode re-read from disk and delayed inode
-	 * in delayed_nodes_tree.
+	 * in the delayed_nodes xarray.
	 */
	if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
@@ -4725,7 +4734,7 @@ again:
	/*
	 * We unlock the page after the io is completed and then re-lock it
	 * above.  release_folio() could have come in between that and cleared
-	 * PagePrivate(), but left the page in the mapping.  Set the page mapped
+	 * folio private, but left the page in the mapping.  Set the page mapped
	 * here to make sure it's properly set for the subpage stuff.
	 */
	ret = set_page_extent_mapped(page);
@@ -4767,9 +4776,10 @@ again:
			memzero_page(page, (block_start - page_offset(page)) + offset,
				     len);
	}
-	btrfs_page_clear_checked(fs_info, page, block_start,
-				 block_end + 1 - block_start);
-	btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
+	btrfs_folio_clear_checked(fs_info, page_folio(page), block_start,
+				  block_end + 1 - block_start);
+	btrfs_folio_set_dirty(fs_info, page_folio(page), block_start,
+			      block_end + 1 - block_start);
	unlock_extent(io_tree, block_start, block_end, &cached_state);
 
	if (only_release_metadata)
@@ -4889,7 +4899,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
		last_byte = ALIGN(last_byte, fs_info->sectorsize);
		hole_size = last_byte - cur_offset;
 
-		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+		if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
			struct extent_map *hole_em;
 
			err = maybe_insert_hole(inode, cur_offset, hole_size);
@@ -4917,7 +4927,6 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
			hole_em->block_len = 0;
			hole_em->orig_block_len = 0;
			hole_em->ram_bytes = hole_size;
-			hole_em->compress_type = BTRFS_COMPRESS_NONE;
			hole_em->generation = btrfs_get_fs_generation(fs_info);
 
			err = btrfs_replace_extent_map_range(inode, hole_em, true);
@@ -6217,6 +6226,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
	inode->i_generation = BTRFS_I(inode)->generation;
 
	/*
+	 * We don't have any capability xattrs set here yet, shortcut any
+	 * queries for the xattrs here.  If we add them later via the inode
+	 * security init path or any other path this flag will be cleared.
+	 */
+	set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
+	/*
	 * Subvolumes don't inherit flags from their parent directory.
	 * Originally this was probably by accident, but we probably can't
	 * change it now without compatibility issues.
@@ -7258,13 +7274,11 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
	em->orig_block_len = orig_block_len;
	em->ram_bytes = ram_bytes;
	em->generation = -1;
-	set_bit(EXTENT_FLAG_PINNED, &em->flags);
-	if (type == BTRFS_ORDERED_PREALLOC) {
-		set_bit(EXTENT_FLAG_FILLING, &em->flags);
-	} else if (type == BTRFS_ORDERED_COMPRESSED) {
-		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
-		em->compress_type = compress_type;
-	}
+	em->flags |= EXTENT_FLAG_PINNED;
+	if (type == BTRFS_ORDERED_PREALLOC)
+		em->flags |= EXTENT_FLAG_FILLING;
+	else if (type == BTRFS_ORDERED_COMPRESSED)
+		extent_map_set_compression(em, compress_type);
 
	ret = btrfs_replace_extent_map_range(inode, em, true);
	if (ret) {
@@ -7304,10 +7318,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
	 * just use the extent.
	 *
	 */
-	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+	if ((em->flags & EXTENT_FLAG_PREALLOC) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->block_start != EXTENT_MAP_HOLE)) {
-		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		if (em->flags & EXTENT_FLAG_PREALLOC)
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
@@ -7542,7 +7556,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
	 * to buffered IO.  Don't blame me, this is the price we pay for using
	 * the generic code.
	 */
-	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+	if (extent_map_is_compressed(em) ||
	    em->block_start == EXTENT_MAP_INLINE) {
		free_extent_map(em);
		/*
@@ -7638,7 +7652,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
	 * that, since we have locked only the parts we are performing I/O in.
	 */
	if ((em->block_start == EXTENT_MAP_HOLE) ||
-	    (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+	    ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->type = IOMAP_HOLE;
	} else {
@@ -7851,13 +7865,14 @@ static void btrfs_readahead(struct readahead_control *rac)
 static void wait_subpage_spinlock(struct page *page)
 {
	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+	struct folio *folio = page_folio(page);
	struct btrfs_subpage *subpage;
 
-	if (!btrfs_is_subpage(fs_info, page))
+	if (!btrfs_is_subpage(fs_info, page->mapping))
		return;
 
-	ASSERT(PagePrivate(page) && page->private);
-	subpage = (struct btrfs_subpage *)page->private;
+	ASSERT(folio_test_private(folio) && folio_get_private(folio));
+	subpage = folio_get_private(folio);
 
	/*
	 * This may look insane as we just acquire the spinlock and release it,
@@ -7995,7 +8010,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
				       page_end);
		ASSERT(range_end + 1 - cur < U32_MAX);
		range_len = range_end + 1 - cur;
-		if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
+		if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
			/*
			 * If Ordered (Private2) is cleared, it means endio has
			 * already been executed for the range.
@@ -8004,7 +8019,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
			 */
			goto next;
		}
-		btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
+		btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
 
		/*
		 * IO on this page will never be started, so we need to account
@@ -8074,7 +8089,7 @@ next:
	 * did something wrong.
	 */
	ASSERT(!folio_test_ordered(folio));
-	btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
+	btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
	if (!inode_evicting)
		__btrfs_release_folio(folio, GFP_NOFS);
	clear_page_extent_mapped(&folio->page);
@@ -8098,6 +8113,7 @@ next:
 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
 {
	struct page *page = vmf->page;
+	struct folio *folio = page_folio(page);
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -8114,6 +8130,8 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
	u64 page_end;
	u64 end;
 
+	ASSERT(folio_order(folio) == 0);
+
	reserved_space = PAGE_SIZE;
 
	sb_start_pagefault(inode->i_sb);
@@ -8217,9 +8235,9 @@ again:
	if (zero_start != PAGE_SIZE)
		memzero_page(page, zero_start, PAGE_SIZE - zero_start);
 
-	btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
-	btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
-	btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
+	btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+	btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
+	btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
 
	btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
 
@@ -8462,10 +8480,20 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
	struct btrfs_inode *ei;
	struct inode *inode;
+	struct extent_io_tree *file_extent_tree = NULL;
+
+	/* Self tests may pass a NULL fs_info. */
+	if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) {
+		file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
+		if (!file_extent_tree)
+			return NULL;
+	}
 
	ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
-	if (!ei)
+	if (!ei) {
+		kfree(file_extent_tree);
		return NULL;
+	}
 
	ei->root = NULL;
	ei->generation = 0;
@@ -8501,10 +8529,18 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 
	inode = &ei->vfs_inode;
	extent_map_tree_init(&ei->extent_tree);
+
+	/* This io tree sets the valid inode. */
	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
	ei->io_tree.inode = ei;
-	extent_io_tree_init(fs_info, &ei->file_extent_tree,
-			    IO_TREE_INODE_FILE_EXTENT);
+
+	ei->file_extent_tree = file_extent_tree;
+	if (file_extent_tree) {
+		extent_io_tree_init(fs_info, ei->file_extent_tree,
+				    IO_TREE_INODE_FILE_EXTENT);
+		/* Lockdep class is set only for the file extent tree. */
+		lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class);
+	}
	mutex_init(&ei->log_mutex);
	spin_lock_init(&ei->ordered_tree_lock);
	ei->ordered_tree = RB_ROOT;
@@ -8521,12 +8557,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 void btrfs_test_destroy_inode(struct inode *inode)
 {
	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
+	kfree(BTRFS_I(inode)->file_extent_tree);
	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 #endif
 
 void btrfs_free_inode(struct inode *inode)
 {
+	kfree(BTRFS_I(inode)->file_extent_tree);
	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
@@ -9632,7 +9670,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = ins.offset;
-		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+		em->flags |= EXTENT_FLAG_PREALLOC;
		em->generation = trans->transid;
 
		ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
@@ -9785,7 +9823,9 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
		page = find_get_page(inode->vfs_inode.i_mapping, index);
		ASSERT(page); /* Pages should be in the extent_io_tree */
 
-		btrfs_page_set_writeback(fs_info, page, start, len);
+		/* This is for data, which doesn't yet support larger folio. */
+		ASSERT(folio_order(page_folio(page)) == 0);
+		btrfs_folio_set_writeback(fs_info, page_folio(page), start, len);
		put_page(page);
		index++;
	}
@@ -9994,7 +10034,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
	if (!pages)
		return -ENOMEM;
-	ret = btrfs_alloc_page_array(nr_pages, pages);
+	ret = btrfs_alloc_page_array(nr_pages, pages, 0);
	if (ret) {
		ret = -ENOMEM;
		goto out;
@@ -10113,12 +10153,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
	encoded->len = min_t(u64, extent_map_end(em),
			     inode->vfs_inode.i_size) - iocb->ki_pos;
	if (em->block_start == EXTENT_MAP_HOLE ||
-	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+	    (em->flags & EXTENT_FLAG_PREALLOC)) {
		disk_bytenr = EXTENT_MAP_HOLE;
		count = min_t(u64, count, encoded->len);
		encoded->len = count;
		encoded->unencoded_len = count;
-	} else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+	} else if (extent_map_is_compressed(em)) {
		disk_bytenr = em->block_start;
		/*
		 * Bail if the buffer isn't large enough to return the whole
@@ -10133,7 +10173,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
		encoded->unencoded_len = em->ram_bytes;
		encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
		ret = btrfs_encoded_io_compression_from_extent(fs_info,
-							       em->compress_type);
+							       extent_map_compression(em));
		if (ret < 0)
			goto out_em;
		encoded->compression = ret;
@@ -10564,6 +10604,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_state *cached_state = NULL;
	struct extent_map *em = NULL;
+	struct btrfs_chunk_map *map = NULL;
	struct btrfs_device *device = NULL;
	struct btrfs_swap_info bsi = {
		.lowest_ppage = (sector_t)-1ULL,
@@ -10680,7 +10721,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
			ret = -EINVAL;
			goto out;
		}
-		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+		if (extent_map_is_compressed(em)) {
			btrfs_warn(fs_info, "swapfile must not be compressed");
			ret = -EINVAL;
			goto out;
@@ -10703,13 +10744,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
			goto out;
		}
 
-		em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
-		if (IS_ERR(em)) {
-			ret = PTR_ERR(em);
+		map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
+		if (IS_ERR(map)) {
+			ret = PTR_ERR(map);
			goto out;
		}
 
-		if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+		if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
			btrfs_warn(fs_info,
				   "swapfile must have single data profile");
			ret = -EINVAL;
@@ -10717,23 +10758,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
		}
 
		if (device == NULL) {
-			device = em->map_lookup->stripes[0].dev;
+			device = map->stripes[0].dev;
			ret = btrfs_add_swapfile_pin(inode, device, false);
			if (ret == 1)
				ret = 0;
			else if (ret)
				goto out;
-		} else if (device != em->map_lookup->stripes[0].dev) {
+		} else if (device != map->stripes[0].dev) {
			btrfs_warn(fs_info, "swapfile must be on one device");
			ret = -EINVAL;
			goto out;
		}
 
-		physical_block_start = (em->map_lookup->stripes[0].physical +
-					(logical_block_start - em->start));
-		len = min(len, em->len - (logical_block_start - em->start));
-		free_extent_map(em);
-		em = NULL;
+		physical_block_start = (map->stripes[0].physical +
+					(logical_block_start - map->start));
+		len = min(len, map->chunk_len - (logical_block_start - map->start));
+		btrfs_free_chunk_map(map);
+		map = NULL;
 
		bg = btrfs_lookup_block_group(fs_info, logical_block_start);
		if (!bg) {
@@ -10786,6 +10827,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 out:
	if (!IS_ERR_OR_NULL(em))
		free_extent_map(em);
+	if (!IS_ERR_OR_NULL(map))
+		btrfs_free_chunk_map(map);
 
	unlock_extent(io_tree, 0, isize - 1, &cached_state);
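The swapfile hunks above show the chunk map refactoring mentioned in the log: btrfs_get_chunk_map() now returns a struct btrfs_chunk_map directly instead of a struct extent_map whose fields were reached through em->map_lookup. A minimal sketch of the new lookup pattern, with hypothetical variable names:

	struct btrfs_chunk_map *map;
	u64 physical;

	map = btrfs_get_chunk_map(fs_info, logical, len);	/* refcounted lookup */
	if (IS_ERR(map))
		return PTR_ERR(map);

	/* Fields formerly behind em->map_lookup are direct members now. */
	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK)
		btrfs_warn(fs_info, "swapfile must have single data profile");
	physical = map->stripes[0].physical + (logical - map->start);

	btrfs_free_chunk_map(map);	/* drop the reference */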