From a8dd0cfa37792863b6c4bf9542975212a6715d49 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 11 Feb 2022 16:32:26 -0800 Subject: fs/proc: task_mmu.c: don't read mapcount for migration entry commit 24d7275ce2791829953ed4e72f68277ceb2571c6 upstream. The syzbot reported the below BUG: kernel BUG at include/linux/page-flags.h:785! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 4392 Comm: syz-executor560 Not tainted 5.16.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline] RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744 Call Trace: page_mapcount include/linux/mm.h:837 [inline] smaps_account+0x470/0xb10 fs/proc/task_mmu.c:466 smaps_pte_entry fs/proc/task_mmu.c:538 [inline] smaps_pte_range+0x611/0x1250 fs/proc/task_mmu.c:601 walk_pmd_range mm/pagewalk.c:128 [inline] walk_pud_range mm/pagewalk.c:205 [inline] walk_p4d_range mm/pagewalk.c:240 [inline] walk_pgd_range mm/pagewalk.c:277 [inline] __walk_page_range+0xe23/0x1ea0 mm/pagewalk.c:379 walk_page_vma+0x277/0x350 mm/pagewalk.c:530 smap_gather_stats.part.0+0x148/0x260 fs/proc/task_mmu.c:768 smap_gather_stats fs/proc/task_mmu.c:741 [inline] show_smap+0xc6/0x440 fs/proc/task_mmu.c:822 seq_read_iter+0xbb0/0x1240 fs/seq_file.c:272 seq_read+0x3e0/0x5b0 fs/seq_file.c:162 vfs_read+0x1b5/0x600 fs/read_write.c:479 ksys_read+0x12d/0x250 fs/read_write.c:619 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae The reproducer was trying to read /proc/$PID/smaps when calling MADV_FREE at the mean time. MADV_FREE may split THPs if it is called for partial THP. It may trigger the below race: CPU A CPU B ----- ----- smaps walk: MADV_FREE: page_mapcount() PageCompound() split_huge_page() page = compound_head(page) PageDoubleMap(page) When calling PageDoubleMap() this page is not a tail page of THP anymore so the BUG is triggered. This could be fixed by elevated refcount of the page before calling mapcount, but that would prevent it from counting migration entries, and it seems overkilling because the race just could happen when PMD is split so all PTE entries of tail pages are actually migration entries, and smaps_account() does treat migration entries as mapcount == 1 as Kirill pointed out. Add a new parameter for smaps_account() to tell this entry is migration entry then skip calling page_mapcount(). Don't skip getting mapcount for device private entries since they do track references with mapcount. Pagemap also has the similar issue although it was not reported. Fixed it as well. [shy828301@gmail.com: v4] Link: https://lkml.kernel.org/r/20220203182641.824731-1-shy828301@gmail.com [nathan@kernel.org: avoid unused variable warning in pagemap_pmd_range()] Link: https://lkml.kernel.org/r/20220207171049.1102239-1-nathan@kernel.org Link: https://lkml.kernel.org/r/20220120202805.3369-1-shy828301@gmail.com Fixes: e9b61f19858a ("thp: reintroduce split_huge_page()") Signed-off-by: Yang Shi Signed-off-by: Nathan Chancellor Reported-by: syzbot+1f52b3a18d5633fa7f82@syzkaller.appspotmail.com Acked-by: David Hildenbrand Cc: "Kirill A. Shutemov" Cc: Jann Horn Cc: Matthew Wilcox Cc: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/proc/task_mmu.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index cf25be3e0321..958fce7aee63 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -430,7 +430,8 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, } static void smaps_account(struct mem_size_stats *mss, struct page *page, - bool compound, bool young, bool dirty, bool locked) + bool compound, bool young, bool dirty, bool locked, + bool migration) { int i, nr = compound ? compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; @@ -457,8 +458,15 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, * page_count(page) == 1 guarantees the page is mapped exactly once. * If any subpage of the compound page mapped with PTE it would elevate * page_count(). + * + * The page_mapcount() is called to get a snapshot of the mapcount. + * Without holding the page lock this snapshot can be slightly wrong as + * we cannot always read the mapcount atomically. It is not safe to + * call page_mapcount() even with PTL held if the page is not mapped, + * especially for migration entries. Treat regular migration entries + * as mapcount == 1. */ - if (page_count(page) == 1) { + if ((page_count(page) == 1) || migration) { smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty, locked, true); return; @@ -495,6 +503,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; + bool migration = false; if (pte_present(*pte)) { page = vm_normal_page(vma, addr, *pte); @@ -514,8 +523,11 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else { mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; } - } else if (is_pfn_swap_entry(swpent)) + } else if (is_pfn_swap_entry(swpent)) { + if (is_migration_entry(swpent)) + migration = true; page = pfn_swap_entry_to_page(swpent); + } } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap && pte_none(*pte))) { page = xa_load(&vma->vm_file->f_mapping->i_pages, @@ -528,7 +540,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked); + smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), + locked, migration); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -539,6 +552,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; + bool migration = false; if (pmd_present(*pmd)) { /* FOLL_DUMP will return -EFAULT on huge zero page */ @@ -546,8 +560,10 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { swp_entry_t entry = pmd_to_swp_entry(*pmd); - if (is_migration_entry(entry)) + if (is_migration_entry(entry)) { + migration = true; page = pfn_swap_entry_to_page(entry); + } } if (IS_ERR_OR_NULL(page)) return; @@ -559,7 +575,9 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, /* pass */; else mss->file_thp += HPAGE_PMD_SIZE; - smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); + + smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), + locked, migration); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, @@ -1363,6 +1381,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, { u64 frame = 0, flags = 0; struct page *page = NULL; + bool migration = false; if (pte_present(pte)) { if (pm->show_pfn) @@ -1384,13 +1403,14 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags |= PM_SWAP; + migration = is_migration_entry(entry); if (is_pfn_swap_entry(entry)) page = pfn_swap_entry_to_page(entry); } if (page && !PageAnon(page)) flags |= PM_FILE; - if (page && page_mapcount(page) == 1) + if (page && !migration && page_mapcount(page) == 1) flags |= PM_MMAP_EXCLUSIVE; if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; @@ -1406,8 +1426,9 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, spinlock_t *ptl; pte_t *pte, *orig_pte; int err = 0; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE + bool migration = false; + ptl = pmd_trans_huge_lock(pmdp, vma); if (ptl) { u64 flags = 0, frame = 0; @@ -1446,11 +1467,12 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_BUG_ON(!is_pmd_migration_entry(pmd)); + migration = is_migration_entry(entry); page = pfn_swap_entry_to_page(entry); } #endif - if (page && page_mapcount(page) == 1) + if (page && !migration && page_mapcount(page) == 1) flags |= PM_MMAP_EXCLUSIVE; for (; addr != end; addr += PAGE_SIZE) { -- cgit v1.2.3 From ae6ca6343929a00ef95c7c8d1bea47d651785edc Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 11 Nov 2021 14:14:38 +0900 Subject: btrfs: zoned: cache reported zone during mount commit 16beac87e95e2fb278b552397c8260637f8a63f7 upstream. When mounting a device, we are reporting the zones twice: once for checking the zone attributes in btrfs_get_dev_zone_info and once for loading block groups' zone info in btrfs_load_block_group_zone_info(). With a lot of block groups, that leads to a lot of REPORT ZONE commands and slows down the mount process. This patch introduces a zone info cache in struct btrfs_zoned_device_info. The cache is populated while in btrfs_get_dev_zone_info() and used for btrfs_load_block_group_zone_info() to reduce the number of REPORT ZONE commands. The zone cache is then released after loading the block groups, as it will not be much effective during the run time. Benchmark: Mount an HDD with 57,007 block groups Before patch: 171.368 seconds After patch: 64.064 seconds While it still takes a minute due to the slowness of loading all the block groups, the patch reduces the mount time by 1/3. Link: https://lore.kernel.org/linux-btrfs/CAHQ7scUiLtcTqZOMMY5kbWUBOhGRwKo6J6wYPT5WY+C=cD49nQ@mail.gmail.com/ Fixes: 5b316468983d ("btrfs: get zone information of zoned block devices") CC: stable@vger.kernel.org Signed-off-by: Naohiro Aota Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/disk-io.c | 2 ++ fs/btrfs/volumes.c | 2 +- fs/btrfs/zoned.c | 85 +++++++++++++++++++++++++++++++++++++++++++++----- fs/btrfs/zoned.h | 8 +++-- 5 files changed, 87 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index d029be40ea6f..bdbc310a8f8c 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -325,7 +325,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_info->fs_devices; - ret = btrfs_get_dev_zone_info(device); + ret = btrfs_get_dev_zone_info(device, false); if (ret) goto error; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e1a262120e02..2c3e106a0270 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3565,6 +3565,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sysfs; } + btrfs_free_zone_cache(fs_info); + if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c34efdc1ecdd..06a1a7c2254c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2596,7 +2596,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->fs_info = fs_info; device->bdev = bdev; - ret = btrfs_get_dev_zone_info(device); + ret = btrfs_get_dev_zone_info(device, false); if (ret) goto error_free_device; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 5672c24a2d58..596b2148807d 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "ctree.h" #include "volumes.h" #include "zoned.h" @@ -195,6 +196,8 @@ static int emulate_report_zones(struct btrfs_device *device, u64 pos, static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { + struct btrfs_zoned_device_info *zinfo = device->zone_info; + u32 zno; int ret; if (!*nr_zones) @@ -206,6 +209,34 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return 0; } + /* Check cache */ + if (zinfo->zone_cache) { + unsigned int i; + + ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); + zno = pos >> zinfo->zone_size_shift; + /* + * We cannot report zones beyond the zone end. So, it is OK to + * cap *nr_zones to at the end. + */ + *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno); + + for (i = 0; i < *nr_zones; i++) { + struct blk_zone *zone_info; + + zone_info = &zinfo->zone_cache[zno + i]; + if (!zone_info->len) + break; + } + + if (i == *nr_zones) { + /* Cache hit on all the zones */ + memcpy(zones, zinfo->zone_cache + zno, + sizeof(*zinfo->zone_cache) * *nr_zones); + return 0; + } + } + ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, copy_zone_info_cb, zones); if (ret < 0) { @@ -219,6 +250,11 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, if (!ret) return -EIO; + /* Populate cache */ + if (zinfo->zone_cache) + memcpy(zinfo->zone_cache + zno, zones, + sizeof(*zinfo->zone_cache) * *nr_zones); + return 0; } @@ -282,7 +318,7 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) if (!device->bdev) continue; - ret = btrfs_get_dev_zone_info(device); + ret = btrfs_get_dev_zone_info(device, true); if (ret) break; } @@ -291,7 +327,7 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) return ret; } -int btrfs_get_dev_zone_info(struct btrfs_device *device) +int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; @@ -318,6 +354,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) if (!zone_info) return -ENOMEM; + device->zone_info = zone_info; + if (!bdev_is_zoned(bdev)) { if (!fs_info->zone_size) { ret = calculate_emulated_zone_size(fs_info); @@ -369,6 +407,23 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) goto out; } + /* + * Enable zone cache only for a zoned device. On a non-zoned device, we + * fill the zone info with emulated CONVENTIONAL zones, so no need to + * use the cache. + */ + if (populate_cache && bdev_is_zoned(device->bdev)) { + zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) * + zone_info->nr_zones); + if (!zone_info->zone_cache) { + btrfs_err_in_rcu(device->fs_info, + "zoned: failed to allocate zone cache for %s", + rcu_str_deref(device->name)); + ret = -ENOMEM; + goto out; + } + } + /* Get zones type */ while (sector < nr_sectors) { nr_zones = BTRFS_REPORT_NR_ZONES; @@ -444,8 +499,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) kfree(zones); - device->zone_info = zone_info; - switch (bdev_zoned_model(bdev)) { case BLK_ZONED_HM: model = "host-managed zoned"; @@ -478,10 +531,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) out: kfree(zones); out_free_zone_info: - bitmap_free(zone_info->empty_zones); - bitmap_free(zone_info->seq_zones); - kfree(zone_info); - device->zone_info = NULL; + btrfs_destroy_dev_zone_info(device); return ret; } @@ -495,6 +545,7 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device) bitmap_free(zone_info->seq_zones); bitmap_free(zone_info->empty_zones); + vfree(zone_info->zone_cache); kfree(zone_info); device->zone_info = NULL; } @@ -1551,3 +1602,21 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) fs_info->data_reloc_bg = 0; spin_unlock(&fs_info->relocation_bg_lock); } + +void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + + if (!btrfs_is_zoned(fs_info)) + return; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->zone_info) { + vfree(device->zone_info->zone_cache); + device->zone_info->zone_cache = NULL; + } + } + mutex_unlock(&fs_devices->device_list_mutex); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 70b3be517599..813aa3cddc11 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -25,6 +25,7 @@ struct btrfs_zoned_device_info { u32 nr_zones; unsigned long *seq_zones; unsigned long *empty_zones; + struct blk_zone *zone_cache; struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX]; }; @@ -32,7 +33,7 @@ struct btrfs_zoned_device_info { int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone); int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info); -int btrfs_get_dev_zone_info(struct btrfs_device *device); +int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info); @@ -67,6 +68,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, u64 logical, u64 length); void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); +void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -79,7 +81,8 @@ static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_i return 0; } -static inline int btrfs_get_dev_zone_info(struct btrfs_device *device) +static inline int btrfs_get_dev_zone_info(struct btrfs_device *device, + bool populate_cache) { return 0; } @@ -202,6 +205,7 @@ static inline struct btrfs_device *btrfs_zoned_get_device( static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } +static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) -- cgit v1.2.3 From 7e234c47fd2f3c8b342c96cbad8fc5ac45199ec9 Mon Sep 17 00:00:00 2001 From: Dāvis Mosāns Date: Sat, 5 Feb 2022 20:48:23 +0200 Subject: btrfs: send: in case of IO error log it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 2e7be9db125a0bf940c5d65eb5c40d8700f738b5 upstream. Currently if we get IO error while doing send then we abort without logging information about which file caused issue. So log it to help with debugging. CC: stable@vger.kernel.org # 4.9+ Signed-off-by: Dāvis Mosāns Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/send.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 72f9b865e847..5612e8bf2ace 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4978,6 +4978,10 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) lock_page(page); if (!PageUptodate(page)) { unlock_page(page); + btrfs_err(fs_info, + "send: IO error at offset %llu for inode %llu root %llu", + page_offset(page), sctx->cur_ino, + sctx->send_root->root_key.objectid); put_page(page); ret = -EIO; break; -- cgit v1.2.3 From 38f22c730c382b90ee20b5a0a6082dbf2643acf4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 30 Jan 2022 08:53:16 -0800 Subject: vfs: make freeze_super abort when sync_filesystem returns error [ Upstream commit 2719c7160dcfaae1f73a1c0c210ad3281c19022e ] If we fail to synchronize the filesystem while preparing to freeze the fs, abort the freeze. Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner Signed-off-by: Sasha Levin --- fs/super.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index a1f82dfd1b39..87379bb1f7a3 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1616,11 +1616,9 @@ static void lockdep_sb_freeze_acquire(struct super_block *sb) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } -static void sb_freeze_unlock(struct super_block *sb) +static void sb_freeze_unlock(struct super_block *sb, int level) { - int level; - - for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) + for (level--; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); } @@ -1691,7 +1689,14 @@ int freeze_super(struct super_block *sb) sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ - sync_filesystem(sb); + ret = sync_filesystem(sb); + if (ret) { + sb->s_writers.frozen = SB_UNFROZEN; + sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); + wake_up(&sb->s_writers.wait_unfrozen); + deactivate_locked_super(sb); + return ret; + } /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; @@ -1703,7 +1708,7 @@ int freeze_super(struct super_block *sb) printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; - sb_freeze_unlock(sb); + sb_freeze_unlock(sb, SB_FREEZE_FS); wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; @@ -1748,7 +1753,7 @@ static int thaw_super_locked(struct super_block *sb) } sb->s_writers.frozen = SB_UNFROZEN; - sb_freeze_unlock(sb); + sb_freeze_unlock(sb, SB_FREEZE_FS); out: wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); -- cgit v1.2.3 From 64d6f76958c5b1ce559d29c60a9494363922a576 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 30 Jan 2022 08:53:16 -0800 Subject: quota: make dquot_quota_sync return errors from ->sync_fs [ Upstream commit dd5532a4994bfda0386eb2286ec00758cee08444 ] Strangely, dquot_quota_sync ignores the return code from the ->sync_fs call, which means that quotacalls like Q_SYNC never see the error. This doesn't seem right, so fix that. Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner Signed-off-by: Sasha Levin --- fs/quota/dquot.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 22d904bde6ab..a74aef99bd3d 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -690,9 +690,14 @@ int dquot_quota_sync(struct super_block *sb, int type) /* This is not very clever (and fast) but currently I don't know about * any other simple way of getting quota data to disk and we must get * them there for userspace to be visible... */ - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 1); - sync_blockdev(sb->s_bdev); + if (sb->s_op->sync_fs) { + ret = sb->s_op->sync_fs(sb, 1); + if (ret) + return ret; + } + ret = sync_blockdev(sb->s_bdev); + if (ret) + return ret; /* * Now when everything is written we can discard the pagecache so -- cgit v1.2.3 From 68b8924f6038bff902fd5cc63de45f5b8a8ac228 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 3 Jan 2022 16:50:25 +0200 Subject: cifs: fix set of group SID via NTSD xattrs commit dd5a927e411836eaef44eb9b00fece615e82e242 upstream. 'setcifsacl -g ' silently fails to set the group SID on server. Actually, the bug existed since commit 438471b67963 ("CIFS: Add support for setting owner info, dos attributes, and create time"), but this fix will not apply cleanly to kernel versions <= v5.10. Fixes: 3970acf7ddb9 ("SMB3: Add support for getting and setting SACLs") Cc: stable@vger.kernel.org # 5.11+ Signed-off-by: Amir Goldstein Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/xattr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 7d8b72d67c80..9d486fbbfbbd 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -175,11 +175,13 @@ static int cifs_xattr_set(const struct xattr_handler *handler, switch (handler->flags) { case XATTR_CIFS_NTSD_FULL: aclflags = (CIFS_ACL_OWNER | + CIFS_ACL_GROUP | CIFS_ACL_DACL | CIFS_ACL_SACL); break; case XATTR_CIFS_NTSD: aclflags = (CIFS_ACL_OWNER | + CIFS_ACL_GROUP | CIFS_ACL_DACL); break; case XATTR_CIFS_ACL: -- cgit v1.2.3 From 727dd33561d587d257250ffc0004aea8924a1c7d Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 12 Feb 2022 01:54:14 -0600 Subject: smb3: fix snapshot mount option commit 9405b5f8b20c2bfa6523a555279a0379640dc136 upstream. The conversion to the new API broke the snapshot mount option due to 32 vs. 64 bit type mismatch Fixes: 24e0a1eff9e2 ("cifs: switch to new mount api") Cc: stable@vger.kernel.org # 5.11+ Reported-by: Acked-by: Ronnie Sahlberg Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/fs_context.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 0a2542286552..3b8ed36b3711 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -146,7 +146,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_u32("echo_interval", Opt_echo_interval), fsparam_u32("max_credits", Opt_max_credits), fsparam_u32("handletimeout", Opt_handletimeout), - fsparam_u32("snapshot", Opt_snapshot), + fsparam_u64("snapshot", Opt_snapshot), fsparam_u32("max_channels", Opt_max_channels), /* Mount options which take string value */ @@ -1062,7 +1062,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->echo_interval = result.uint_32; break; case Opt_snapshot: - ctx->snapshot_time = result.uint_32; + ctx->snapshot_time = result.uint_64; break; case Opt_max_credits: if (result.uint_32 < 20 || result.uint_32 > 60000) { -- cgit v1.2.3 From f2238b4e83b60c243c35bc352161c8ce7e790725 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 8 Feb 2022 12:14:44 -0500 Subject: NFS: Remove an incorrect revalidation in nfs4_update_changeattr_locked() commit 9d047bf68fe8cdb4086deaf4edd119731a9481ed upstream. In nfs4_update_changeattr_locked(), we don't need to set the NFS_INO_REVAL_PAGECACHE flag, because we already know the value of the change attribute, and we're already flagging the size. In fact, this forces us to revalidate the change attribute a second time for no good reason. This extra flag appears to have been introduced as part of the xattr feature, when update_changeattr_locked() was converted for use by the xattr code. Fixes: 1b523ca972ed ("nfs: modify update_changeattr to deal with regular files") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4proc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 389fa72d4ca9..53be03681f69 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1232,8 +1232,7 @@ nfs4_update_changeattr_locked(struct inode *inode, NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER | NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK | - NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR | - NFS_INO_REVAL_PAGECACHE; + NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR; nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); } nfsi->attrtimeo_timestamp = jiffies; -- cgit v1.2.3 From f1322f10f60c4461b8f3beb2eb36358bb9e74ff9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 8 Feb 2022 13:38:23 -0500 Subject: NFS: LOOKUP_DIRECTORY is also ok with symlinks commit e0caaf75d443e02e55e146fd75fe2efc8aed5540 upstream. Commit ac795161c936 (NFSv4: Handle case where the lookup of a directory fails) [1], part of Linux since 5.17-rc2, introduced a regression, where a symbolic link on an NFS mount to a directory on another NFS does not resolve(?) the first time it is accessed: Reported-by: Paul Menzel Fixes: ac795161c936 ("NFSv4: Handle case where the lookup of a directory fails") Signed-off-by: Trond Myklebust Tested-by: Donald Buczek Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman --- fs/nfs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f6381c675cbe..9adc6f57a008 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1987,14 +1987,14 @@ no_open: if (!res) { inode = d_inode(dentry); if ((lookup_flags & LOOKUP_DIRECTORY) && inode && - !S_ISDIR(inode->i_mode)) + !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) res = ERR_PTR(-ENOTDIR); else if (inode && S_ISREG(inode->i_mode)) res = ERR_PTR(-EOPENSTALE); } else if (!IS_ERR(res)) { inode = d_inode(res); if ((lookup_flags & LOOKUP_DIRECTORY) && inode && - !S_ISDIR(inode->i_mode)) { + !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) { dput(res); res = ERR_PTR(-ENOTDIR); } else if (inode && S_ISREG(inode->i_mode)) { -- cgit v1.2.3 From 21165833efa6bed0ad449baf69877e56e106911f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 15 Feb 2022 18:05:18 -0500 Subject: NFS: Do not report writeback errors in nfs_getattr() commit d19e0183a88306acda07f4a01fedeeffe2a2a06b upstream. The result of the writeback, whether it is an ENOSPC or an EIO, or anything else, does not inhibit the NFS client from reporting the correct file timestamps. Fixes: 79566ef018f5 ("NFS: Getattr doesn't require data sync semantics") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman --- fs/nfs/inode.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f9d3ad3acf11..410f87bc48cc 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -840,12 +840,9 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, } /* Flush out writes to the server in order to update c/mtime. */ - if ((request_mask & (STATX_CTIME|STATX_MTIME)) && - S_ISREG(inode->i_mode)) { - err = filemap_write_and_wait(inode->i_mapping); - if (err) - goto out; - } + if ((request_mask & (STATX_CTIME | STATX_MTIME)) && + S_ISREG(inode->i_mode)) + filemap_write_and_wait(inode->i_mapping); /* * We may force a getattr if the user cares about atime. -- cgit v1.2.3 From 03dd71e0b9d8e9dac462f51aad69455c5a7a25da Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 30 Jan 2022 18:28:56 +0900 Subject: ksmbd: fix same UniqueId for dot and dotdot entries [ Upstream commit 97550c7478a2da93e348d8c3075d92cddd473a78 ] ksmbd sets the inode number to UniqueId. However, the same UniqueId for dot and dotdot entry is set to the inode number of the parent inode. This patch set them using the current inode and parent inode. Signed-off-by: Namjae Jeon Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/ksmbd/smb_common.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index 707490ab1f4c..f2e7e3a654b3 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -308,14 +308,17 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, for (i = 0; i < 2; i++) { struct kstat kstat; struct ksmbd_kstat ksmbd_kstat; + struct dentry *dentry; if (!dir->dot_dotdot[i]) { /* fill dot entry info */ if (i == 0) { d_info->name = "."; d_info->name_len = 1; + dentry = dir->filp->f_path.dentry; } else { d_info->name = ".."; d_info->name_len = 2; + dentry = dir->filp->f_path.dentry->d_parent; } if (!match_pattern(d_info->name, d_info->name_len, @@ -327,7 +330,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, ksmbd_kstat.kstat = &kstat; ksmbd_vfs_fill_dentry_attrs(work, user_ns, - dir->filp->f_path.dentry->d_parent, + dentry, &ksmbd_kstat); rc = fn(conn, info_level, d_info, &ksmbd_kstat); if (rc) -- cgit v1.2.3 From 5644bf688e4a6042e57288cbc28cc083febb8f99 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 30 Jan 2022 18:31:01 +0900 Subject: ksmbd: don't align last entry offset in smb2 query directory [ Upstream commit 04e260948a160d3b7d622bf4c8a96fa4577c09bd ] When checking smb2 query directory packets from other servers, OutputBufferLength is different with ksmbd. Other servers add an unaligned next offset to OutputBufferLength for the last entry. Signed-off-by: Namjae Jeon Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/ksmbd/smb2pdu.c | 7 ++++--- fs/ksmbd/vfs.h | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 70685cbbec8c..192d8308afc2 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -3422,9 +3422,9 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, goto free_conv_name; } - struct_sz = readdir_info_level_struct_sz(info_level); - next_entry_offset = ALIGN(struct_sz - 1 + conv_len, - KSMBD_DIR_INFO_ALIGNMENT); + struct_sz = readdir_info_level_struct_sz(info_level) - 1 + conv_len; + next_entry_offset = ALIGN(struct_sz, KSMBD_DIR_INFO_ALIGNMENT); + d_info->last_entry_off_align = next_entry_offset - struct_sz; if (next_entry_offset > d_info->out_buf_len) { d_info->out_buf_len = 0; @@ -3976,6 +3976,7 @@ int smb2_query_dir(struct ksmbd_work *work) ((struct file_directory_info *) ((char *)rsp->Buffer + d_info.last_entry_offset)) ->NextEntryOffset = 0; + d_info.data_count -= d_info.last_entry_off_align; rsp->StructureSize = cpu_to_le16(9); rsp->OutputBufferOffset = cpu_to_le16(72); diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index b0d5b8feb4a3..432c94773177 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -86,6 +86,7 @@ struct ksmbd_dir_info { int last_entry_offset; bool hide_dot_file; int flags; + int last_entry_off_align; }; struct ksmbd_readdir_data { -- cgit v1.2.3 From 246dfbc12539808b024b4e53864fc494bde679b9 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 24 Jan 2022 21:17:36 -0800 Subject: mm: io_uring: allow oom-killer from io_uring_setup [ Upstream commit 0a3f1e0beacf6cc8ae5f846b0641c1df476e83d6 ] On an overcommitted system which is running multiple workloads of varying priorities, it is preferred to trigger an oom-killer to kill a low priority workload than to let the high priority workload receiving ENOMEMs. On our memory overcommitted systems, we are seeing a lot of ENOMEMs instead of oom-kills because io_uring_setup callchain is using __GFP_NORETRY gfp flag which avoids the oom-killer. Let's remove it and allow the oom-killer to kill a lower priority job. Signed-off-by: Shakeel Butt Link: https://lore.kernel.org/r/20220125051736.2981459-1-shakeelb@google.com Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/io_uring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 993913c585fb..21fc8ce9405d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8820,10 +8820,9 @@ static void io_mem_free(void *ptr) static void *io_mem_alloc(size_t size) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | - __GFP_NORETRY | __GFP_ACCOUNT; + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; - return (void *) __get_free_pages(gfp_flags, get_order(size)); + return (void *) __get_free_pages(gfp, get_order(size)); } static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, -- cgit v1.2.3 From 7e80846a99271f8a643404495288d1bac2894c5b Mon Sep 17 00:00:00 2001 From: Su Yue Date: Fri, 21 Jan 2022 17:33:34 +0800 Subject: btrfs: tree-checker: check item_size for inode_item commit 0c982944af27d131d3b74242f3528169f66950ad upstream. while mounting the crafted image, out-of-bounds access happens: [350.429619] UBSAN: array-index-out-of-bounds in fs/btrfs/struct-funcs.c:161:1 [350.429636] index 1048096 is out of range for type 'page *[16]' [350.429650] CPU: 0 PID: 9 Comm: kworker/u8:1 Not tainted 5.16.0-rc4 #1 [350.429652] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [350.429653] Workqueue: btrfs-endio-meta btrfs_work_helper [btrfs] [350.429772] Call Trace: [350.429774] [350.429776] dump_stack_lvl+0x47/0x5c [350.429780] ubsan_epilogue+0x5/0x50 [350.429786] __ubsan_handle_out_of_bounds+0x66/0x70 [350.429791] btrfs_get_16+0xfd/0x120 [btrfs] [350.429832] check_leaf+0x754/0x1a40 [btrfs] [350.429874] ? filemap_read+0x34a/0x390 [350.429878] ? load_balance+0x175/0xfc0 [350.429881] validate_extent_buffer+0x244/0x310 [btrfs] [350.429911] btrfs_validate_metadata_buffer+0xf8/0x100 [btrfs] [350.429935] end_bio_extent_readpage+0x3af/0x850 [btrfs] [350.429969] ? newidle_balance+0x259/0x480 [350.429972] end_workqueue_fn+0x29/0x40 [btrfs] [350.429995] btrfs_work_helper+0x71/0x330 [btrfs] [350.430030] ? __schedule+0x2fb/0xa40 [350.430033] process_one_work+0x1f6/0x400 [350.430035] ? process_one_work+0x400/0x400 [350.430036] worker_thread+0x2d/0x3d0 [350.430037] ? process_one_work+0x400/0x400 [350.430038] kthread+0x165/0x190 [350.430041] ? set_kthread_struct+0x40/0x40 [350.430043] ret_from_fork+0x1f/0x30 [350.430047] [350.430077] BTRFS warning (device loop0): bad eb member start: ptr 0xffe20f4e start 20975616 member offset 4293005178 size 2 check_leaf() is checking the leaf: corrupt leaf: root=4 block=29396992 slot=1, bad key order, prev (16140901064495857664 1 0) current (1 204 12582912) leaf 29396992 items 6 free space 3565 generation 6 owner DEV_TREE leaf 29396992 flags 0x1(WRITTEN) backref revision 1 fs uuid a62e00e8-e94e-4200-8217-12444de93c2e chunk uuid cecbd0f7-9ca0-441e-ae9f-f782f9732bd8 item 0 key (16140901064495857664 INODE_ITEM 0) itemoff 3955 itemsize 40 generation 0 transid 0 size 0 nbytes 17592186044416 block group 0 mode 52667 links 33 uid 0 gid 2104132511 rdev 94223634821136 sequence 100305 flags 0x2409000(none) atime 0.0 (1970-01-01 08:00:00) ctime 2973280098083405823.4294967295 (-269783007-01-01 21:37:03) mtime 18446744071572723616.4026825121 (1902-04-16 12:40:00) otime 9249929404488876031.4294967295 (622322949-04-16 04:25:58) item 1 key (1 DEV_EXTENT 12582912) itemoff 3907 itemsize 48 dev extent chunk_tree 3 chunk_objectid 256 chunk_offset 12582912 length 8388608 chunk_tree_uuid cecbd0f7-9ca0-441e-ae9f-f782f9732bd8 The corrupted leaf of device tree has an inode item. The leaf passed checksum and others checks in validate_extent_buffer until check_leaf_item(). Because of the key type BTRFS_INODE_ITEM, check_inode_item() is called even we are in the device tree. Since the item offset + sizeof(struct btrfs_inode_item) > eb->len, out-of-bounds access is triggered. The item end vs leaf boundary check has been done before check_leaf_item(), so fix it by checking item size in check_inode_item() before access of the inode item in extent buffer. Other check functions except check_dev_item() in check_leaf_item() have their item size checks. The commit for check_dev_item() is followed. No regression observed during running fstests. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215299 CC: stable@vger.kernel.org # 5.10+ CC: Wenqing Liu Signed-off-by: Su Yue Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-checker.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 7733e8ac0a69..692d372a54cd 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1007,6 +1007,7 @@ static int check_inode_item(struct extent_buffer *leaf, struct btrfs_inode_item *iitem; u64 super_gen = btrfs_super_generation(fs_info->super_copy); u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); + const u32 item_size = btrfs_item_size_nr(leaf, slot); u32 mode; int ret; u32 flags; @@ -1016,6 +1017,12 @@ static int check_inode_item(struct extent_buffer *leaf, if (unlikely(ret < 0)) return ret; + if (unlikely(item_size != sizeof(*iitem))) { + generic_err(leaf, slot, "invalid item size: has %u expect %zu", + item_size, sizeof(*iitem)); + return -EUCLEAN; + } + iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item); /* Here we use super block generation + 1 to handle log tree */ -- cgit v1.2.3 From b80fbc20f33489dd810edfc52343e3c0f4b2403a Mon Sep 17 00:00:00 2001 From: Su Yue Date: Fri, 21 Jan 2022 17:33:35 +0800 Subject: btrfs: tree-checker: check item_size for dev_item commit ea1d1ca4025ac6c075709f549f9aa036b5b6597d upstream. Check item size before accessing the device item to avoid out of bound access, similar to inode_item check. Signed-off-by: Su Yue Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-checker.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 692d372a54cd..51382d2be3d4 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -965,6 +965,7 @@ static int check_dev_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_dev_item *ditem; + const u32 item_size = btrfs_item_size_nr(leaf, slot); if (unlikely(key->objectid != BTRFS_DEV_ITEMS_OBJECTID)) { dev_item_err(leaf, slot, @@ -972,6 +973,13 @@ static int check_dev_item(struct extent_buffer *leaf, key->objectid, BTRFS_DEV_ITEMS_OBJECTID); return -EUCLEAN; } + + if (unlikely(item_size != sizeof(*ditem))) { + dev_item_err(leaf, slot, "invalid item size: has %u expect %zu", + item_size, sizeof(*ditem)); + return -EUCLEAN; + } + ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); if (unlikely(btrfs_device_id(leaf, ditem) != key->offset)) { dev_item_err(leaf, slot, -- cgit v1.2.3 From 7c83437fb3aeda489638304edf7c386b16e7cc5b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 21 Feb 2022 05:49:30 -0700 Subject: io_uring: don't convert to jiffies for waiting on timeouts commit 228339662b398a59b3560cd571deb8b25b253c7e upstream. If an application calls io_uring_enter(2) with a timespec passed in, convert that timespec to ktime_t rather than jiffies. The latter does not provide the granularity the application may expect, and may in fact provided different granularity on different systems, depending on what the HZ value is configured at. Turn the timespec into an absolute ktime_t, and use that with schedule_hrtimeout() instead. Link: https://github.com/axboe/liburing/issues/531 Cc: stable@vger.kernel.org Reported-by: Bob Chen Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 21fc8ce9405d..9ba4c12cb335 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7590,7 +7590,7 @@ static int io_run_task_work_sig(void) /* when returns >0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, - signed long *timeout) + ktime_t timeout) { int ret; @@ -7602,8 +7602,9 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, if (test_bit(0, &ctx->check_cq_overflow)) return 1; - *timeout = schedule_timeout(*timeout); - return !*timeout ? -ETIME : 1; + if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) + return -ETIME; + return 1; } /* @@ -7616,7 +7617,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, { struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; - signed long timeout = MAX_SCHEDULE_TIMEOUT; + ktime_t timeout = KTIME_MAX; int ret; do { @@ -7632,7 +7633,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (get_timespec64(&ts, uts)) return -EFAULT; - timeout = timespec64_to_jiffies(&ts); + timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); } if (sig) { @@ -7664,7 +7665,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, } prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); - ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); + ret = io_cqring_wait_schedule(ctx, &iowq, timeout); finish_wait(&ctx->cq_wait, &iowq.wq); cond_resched(); } while (ret > 0); -- cgit v1.2.3 From 0d773aaf5a90e2856301bd30781bf107db917a66 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 22 Feb 2022 08:17:51 -0800 Subject: io_uring: disallow modification of rsrc_data during quiesce commit 80912cef18f16f8fe59d1fb9548d4364342be360 upstream. io_rsrc_ref_quiesce will unlock the uring while it waits for references to the io_rsrc_data to be killed. There are other places to the data that might add references to data via calls to io_rsrc_node_switch. There is a race condition where this reference can be added after the completion has been signalled. At this point the io_rsrc_ref_quiesce call will wake up and relock the uring, assuming the data is unused and can be freed - although it is actually being used. To fix this check in io_rsrc_ref_quiesce if a resource has been revived. Reported-by: syzbot+ca8bf833622a1662745b@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220222161751.995746-1-dylany@fb.com Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9ba4c12cb335..6558f2c2eb19 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7818,7 +7818,15 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct ret = wait_for_completion_interruptible(&data->done); if (!ret) { mutex_lock(&ctx->uring_lock); - break; + if (atomic_read(&data->refs) > 0) { + /* + * it has been revived by another thread while + * we were unlocked + */ + mutex_unlock(&ctx->uring_lock); + } else { + break; + } } atomic_inc(&data->refs); -- cgit v1.2.3 From c718ea4e7382e18957ed0e88a5f855e2122d9c00 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Feb 2022 20:10:03 -0800 Subject: io_uring: add a schedule point in io_add_buffers() commit f240762f88b4b1b58561939ffd44837759756477 upstream. Looping ~65535 times doing kmalloc() calls can trigger soft lockups, especially with DEBUG features (like KASAN). [ 253.536212] watchdog: BUG: soft lockup - CPU#64 stuck for 26s! [b219417889:12575] [ 253.544433] Modules linked in: vfat fat i2c_mux_pca954x i2c_mux spidev cdc_acm xhci_pci xhci_hcd sha3_generic gq(O) [ 253.544451] CPU: 64 PID: 12575 Comm: b219417889 Tainted: G S O 5.17.0-smp-DEV #801 [ 253.544457] RIP: 0010:kernel_text_address (./include/asm-generic/sections.h:192 ./include/linux/kallsyms.h:29 kernel/extable.c:67 kernel/extable.c:98) [ 253.544464] Code: 0f 93 c0 48 c7 c1 e0 63 d7 a4 48 39 cb 0f 92 c1 20 c1 0f b6 c1 5b 5d c3 90 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 53 48 89 fb <48> c7 c0 00 00 80 a0 41 be 01 00 00 00 48 39 c7 72 0c 48 c7 c0 40 [ 253.544468] RSP: 0018:ffff8882d8baf4c0 EFLAGS: 00000246 [ 253.544471] RAX: 1ffff1105b175e00 RBX: ffffffffa13ef09a RCX: 00000000a13ef001 [ 253.544474] RDX: ffffffffa13ef09a RSI: ffff8882d8baf558 RDI: ffffffffa13ef09a [ 253.544476] RBP: ffff8882d8baf4d8 R08: ffff8882d8baf5e0 R09: 0000000000000004 [ 253.544479] R10: ffff8882d8baf5e8 R11: ffffffffa0d59a50 R12: ffff8882eab20380 [ 253.544481] R13: ffffffffa0d59a50 R14: dffffc0000000000 R15: 1ffff1105b175eb0 [ 253.544483] FS: 00000000016d3380(0000) GS:ffff88af48c00000(0000) knlGS:0000000000000000 [ 253.544486] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 253.544488] CR2: 00000000004af0f0 CR3: 00000002eabfa004 CR4: 00000000003706e0 [ 253.544491] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 253.544492] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 253.544494] Call Trace: [ 253.544496] [ 253.544498] ? io_queue_sqe (fs/io_uring.c:7143) [ 253.544505] __kernel_text_address (kernel/extable.c:78) [ 253.544508] unwind_get_return_address (arch/x86/kernel/unwind_frame.c:19) [ 253.544514] arch_stack_walk (arch/x86/kernel/stacktrace.c:27) [ 253.544517] ? io_queue_sqe (fs/io_uring.c:7143) [ 253.544521] stack_trace_save (kernel/stacktrace.c:123) [ 253.544527] ____kasan_kmalloc (mm/kasan/common.c:39 mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:515) [ 253.544531] ? ____kasan_kmalloc (mm/kasan/common.c:39 mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:515) [ 253.544533] ? __kasan_kmalloc (mm/kasan/common.c:524) [ 253.544535] ? kmem_cache_alloc_trace (./include/linux/kasan.h:270 mm/slab.c:3567) [ 253.544541] ? io_issue_sqe (fs/io_uring.c:4556 fs/io_uring.c:4589 fs/io_uring.c:6828) [ 253.544544] ? __io_queue_sqe (fs/io_uring.c:?) [ 253.544551] __kasan_kmalloc (mm/kasan/common.c:524) [ 253.544553] kmem_cache_alloc_trace (./include/linux/kasan.h:270 mm/slab.c:3567) [ 253.544556] ? io_issue_sqe (fs/io_uring.c:4556 fs/io_uring.c:4589 fs/io_uring.c:6828) [ 253.544560] io_issue_sqe (fs/io_uring.c:4556 fs/io_uring.c:4589 fs/io_uring.c:6828) [ 253.544564] ? __kasan_slab_alloc (mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:469) [ 253.544567] ? __kasan_slab_alloc (mm/kasan/common.c:39 mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:469) [ 253.544569] ? kmem_cache_alloc_bulk (mm/slab.h:732 mm/slab.c:3546) [ 253.544573] ? __io_alloc_req_refill (fs/io_uring.c:2078) [ 253.544578] ? io_submit_sqes (fs/io_uring.c:7441) [ 253.544581] ? __se_sys_io_uring_enter (fs/io_uring.c:10154 fs/io_uring.c:10096) [ 253.544584] ? __x64_sys_io_uring_enter (fs/io_uring.c:10096) [ 253.544587] ? do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) [ 253.544590] ? entry_SYSCALL_64_after_hwframe (??:?) [ 253.544596] __io_queue_sqe (fs/io_uring.c:?) [ 253.544600] io_queue_sqe (fs/io_uring.c:7143) [ 253.544603] io_submit_sqe (fs/io_uring.c:?) [ 253.544608] io_submit_sqes (fs/io_uring.c:?) [ 253.544612] __se_sys_io_uring_enter (fs/io_uring.c:10154 fs/io_uring.c:10096) [ 253.544616] __x64_sys_io_uring_enter (fs/io_uring.c:10096) [ 253.544619] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) [ 253.544623] entry_SYSCALL_64_after_hwframe (??:?) Fixes: ddf0322db79c ("io_uring: add IORING_OP_PROVIDE_BUFFERS") Signed-off-by: Eric Dumazet Cc: Jens Axboe Cc: Pavel Begunkov Cc: io-uring Reported-by: syzbot Link: https://lore.kernel.org/r/20220215041003.2394784-1-eric.dumazet@gmail.com Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6558f2c2eb19..d7e49e87b49b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4454,6 +4454,7 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) } else { list_add_tail(&buf->list, &(*head)->list); } + cond_resched(); } return i ? i : -ENOMEM; -- cgit v1.2.3 From e7a66dd2687758718eddd79b542a95cf3aa488cc Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Tue, 15 Feb 2022 15:10:30 +0800 Subject: configfs: fix a race in configfs_{,un}register_subsystem() [ Upstream commit 84ec758fb2daa236026506868c8796b0500c047d ] When configfs_register_subsystem() or configfs_unregister_subsystem() is executing link_group() or unlink_group(), it is possible that two processes add or delete list concurrently. Some unfortunate interleavings of them can cause kernel panic. One of cases is: A --> B --> C --> D A <-- B <-- C <-- D delete list_head *B | delete list_head *C --------------------------------|----------------------------------- configfs_unregister_subsystem | configfs_unregister_subsystem unlink_group | unlink_group unlink_obj | unlink_obj list_del_init | list_del_init __list_del_entry | __list_del_entry __list_del | __list_del // next == C | next->prev = prev | | next->prev = prev prev->next = next | | // prev == B | prev->next = next Fix this by adding mutex when calling link_group() or unlink_group(), but parent configfs_subsystem is NULL when config_item is root. So I create a mutex configfs_subsystem_mutex. Fixes: 7063fbf22611 ("[PATCH] configfs: User-driven configuration filesystem") Signed-off-by: ChenXiaoSong Signed-off-by: Laibin Qiu Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- fs/configfs/dir.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index d3cd2a94d1e8..d1f9d2632202 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -34,6 +34,14 @@ */ DEFINE_SPINLOCK(configfs_dirent_lock); +/* + * All of link_obj/unlink_obj/link_group/unlink_group require that + * subsys->su_mutex is held. + * But parent configfs_subsystem is NULL when config_item is root. + * Use this mutex when config_item is root. + */ +static DEFINE_MUTEX(configfs_subsystem_mutex); + static void configfs_d_iput(struct dentry * dentry, struct inode * inode) { @@ -1859,7 +1867,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) group->cg_item.ci_name = group->cg_item.ci_namebuf; sd = root->d_fsdata; + mutex_lock(&configfs_subsystem_mutex); link_group(to_config_group(sd->s_element), group); + mutex_unlock(&configfs_subsystem_mutex); inode_lock_nested(d_inode(root), I_MUTEX_PARENT); @@ -1884,7 +1894,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) inode_unlock(d_inode(root)); if (err) { + mutex_lock(&configfs_subsystem_mutex); unlink_group(group); + mutex_unlock(&configfs_subsystem_mutex); configfs_release_fs(); } put_fragment(frag); @@ -1931,7 +1943,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) dput(dentry); + mutex_lock(&configfs_subsystem_mutex); unlink_group(group); + mutex_unlock(&configfs_subsystem_mutex); configfs_release_fs(); } -- cgit v1.2.3 From 6db927ce66ac68bf732f0b14190791458e75047a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 25 Feb 2022 15:34:26 -0500 Subject: tracefs: Set the group ownership in apply_options() not parse_options() commit 851e99ebeec3f4a672bb5010cf1ece095acee447 upstream. Al Viro brought it to my attention that the dentries may not be filled when the parse_options() is called, causing the call to set_gid() to possibly crash. It should only be called if parse_options() succeeds totally anyway. He suggested the logical place to do the update is in apply_options(). Link: https://lore.kernel.org/all/20220225165219.737025658@goodmis.org/ Link: https://lkml.kernel.org/r/20220225153426.1c4cab6b@gandalf.local.home Cc: stable@vger.kernel.org Acked-by: Al Viro Reported-by: Al Viro Fixes: 48b27b6b5191 ("tracefs: Set all files to the same group ownership as the mount option") Signed-off-by: Steven Rostedt (Google) Signed-off-by: Greg Kroah-Hartman --- fs/tracefs/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 3616839c5c4b..f2625a372a3a 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -264,7 +264,6 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) if (!gid_valid(gid)) return -EINVAL; opts->gid = gid; - set_gid(tracefs_mount->mnt_root, gid); break; case Opt_mode: if (match_octal(&args[0], &option)) @@ -291,7 +290,9 @@ static int tracefs_apply_options(struct super_block *sb) inode->i_mode |= opts->mode; inode->i_uid = opts->uid; - inode->i_gid = opts->gid; + + /* Set all the group ids to the mount option */ + set_gid(sb->s_root, opts->gid); return 0; } -- cgit v1.2.3 From 8df508b7a44cd8110c726057cd28e8f8116885eb Mon Sep 17 00:00:00 2001 From: Dāvis Mosāns Date: Wed, 2 Feb 2022 23:44:55 +0200 Subject: btrfs: prevent copying too big compressed lzo segment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 741b23a970a79d5d3a1db2d64fa2c7b375a4febb upstream. Compressed length can be corrupted to be a lot larger than memory we have allocated for buffer. This will cause memcpy in copy_compressed_segment to write outside of allocated memory. This mostly results in stuck read syscall but sometimes when using btrfs send can get #GP kernel: general protection fault, probably for non-canonical address 0x841551d5c1000: 0000 [#1] PREEMPT SMP NOPTI kernel: CPU: 17 PID: 264 Comm: kworker/u256:7 Tainted: P OE 5.17.0-rc2-1 #12 kernel: Workqueue: btrfs-endio btrfs_work_helper [btrfs] kernel: RIP: 0010:lzo_decompress_bio (./include/linux/fortify-string.h:225 fs/btrfs/lzo.c:322 fs/btrfs/lzo.c:394) btrfs Code starting with the faulting instruction =========================================== 0:* 48 8b 06 mov (%rsi),%rax <-- trapping instruction 3: 48 8d 79 08 lea 0x8(%rcx),%rdi 7: 48 83 e7 f8 and $0xfffffffffffffff8,%rdi b: 48 89 01 mov %rax,(%rcx) e: 44 89 f0 mov %r14d,%eax 11: 48 8b 54 06 f8 mov -0x8(%rsi,%rax,1),%rdx kernel: RSP: 0018:ffffb110812efd50 EFLAGS: 00010212 kernel: RAX: 0000000000001000 RBX: 000000009ca264c8 RCX: ffff98996e6d8ff8 kernel: RDX: 0000000000000064 RSI: 000841551d5c1000 RDI: ffffffff9500435d kernel: RBP: ffff989a3be856c0 R08: 0000000000000000 R09: 0000000000000000 kernel: R10: 0000000000000000 R11: 0000000000001000 R12: ffff98996e6d8000 kernel: R13: 0000000000000008 R14: 0000000000001000 R15: 000841551d5c1000 kernel: FS: 0000000000000000(0000) GS:ffff98a09d640000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 00001e9f984d9ea8 CR3: 000000014971a000 CR4: 00000000003506e0 kernel: Call Trace: kernel: kernel: end_compressed_bio_read (fs/btrfs/compression.c:104 fs/btrfs/compression.c:1363 fs/btrfs/compression.c:323) btrfs kernel: end_workqueue_fn (fs/btrfs/disk-io.c:1923) btrfs kernel: btrfs_work_helper (fs/btrfs/async-thread.c:326) btrfs kernel: process_one_work (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:212 ./include/trace/events/workqueue.h:108 kernel/workqueue.c:2312) kernel: worker_thread (./include/linux/list.h:292 kernel/workqueue.c:2455) kernel: ? process_one_work (kernel/workqueue.c:2397) kernel: kthread (kernel/kthread.c:377) kernel: ? kthread_complete_and_exit (kernel/kthread.c:332) kernel: ret_from_fork (arch/x86/entry/entry_64.S:301) kernel: CC: stable@vger.kernel.org # 4.9+ Signed-off-by: Dāvis Mosāns Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/lzo.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 295bbc13ace6..fcd7eb496478 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -363,6 +363,17 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) kunmap(cur_page); cur_in += LZO_LEN; + if (seg_len > lzo1x_worst_compress(PAGE_SIZE)) { + /* + * seg_len shouldn't be larger than we have allocated + * for workspace->cbuf + */ + btrfs_err(fs_info, "unexpectedly large lzo segment len %u", + seg_len); + ret = -EIO; + goto out; + } + /* Copy the compressed segment payload into workspace */ copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in); -- cgit v1.2.3 From 850a77c999b81dd2724efd2684068d6f90db8c16 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 2 Feb 2022 15:26:09 +0000 Subject: btrfs: get rid of warning on transaction commit when using flushoncommit [ Upstream commit a0f0cf8341e34e5d2265bfd3a7ad68342da1e2aa ] When using the flushoncommit mount option, during almost every transaction commit we trigger a warning from __writeback_inodes_sb_nr(): $ cat fs/fs-writeback.c: (...) static void __writeback_inodes_sb_nr(struct super_block *sb, ... { (...) WARN_ON(!rwsem_is_locked(&sb->s_umount)); (...) } (...) The trace produced in dmesg looks like the following: [947.473890] WARNING: CPU: 5 PID: 930 at fs/fs-writeback.c:2610 __writeback_inodes_sb_nr+0x7e/0xb3 [947.481623] Modules linked in: nfsd nls_cp437 cifs asn1_decoder cifs_arc4 fscache cifs_md4 ipmi_ssif [947.489571] CPU: 5 PID: 930 Comm: btrfs-transacti Not tainted 95.16.3-srb-asrock-00001-g36437ad63879 #186 [947.497969] RIP: 0010:__writeback_inodes_sb_nr+0x7e/0xb3 [947.502097] Code: 24 10 4c 89 44 24 18 c6 (...) [947.519760] RSP: 0018:ffffc90000777e10 EFLAGS: 00010246 [947.523818] RAX: 0000000000000000 RBX: 0000000000963300 RCX: 0000000000000000 [947.529765] RDX: 0000000000000000 RSI: 000000000000fa51 RDI: ffffc90000777e50 [947.535740] RBP: ffff888101628a90 R08: ffff888100955800 R09: ffff888100956000 [947.541701] R10: 0000000000000002 R11: 0000000000000001 R12: ffff888100963488 [947.547645] R13: ffff888100963000 R14: ffff888112fb7200 R15: ffff888100963460 [947.553621] FS: 0000000000000000(0000) GS:ffff88841fd40000(0000) knlGS:0000000000000000 [947.560537] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [947.565122] CR2: 0000000008be50c4 CR3: 000000000220c000 CR4: 00000000001006e0 [947.571072] Call Trace: [947.572354] [947.573266] btrfs_commit_transaction+0x1f1/0x998 [947.576785] ? start_transaction+0x3ab/0x44e [947.579867] ? schedule_timeout+0x8a/0xdd [947.582716] transaction_kthread+0xe9/0x156 [947.585721] ? btrfs_cleanup_transaction.isra.0+0x407/0x407 [947.590104] kthread+0x131/0x139 [947.592168] ? set_kthread_struct+0x32/0x32 [947.595174] ret_from_fork+0x22/0x30 [947.597561] [947.598553] ---[ end trace 644721052755541c ]--- This is because we started using writeback_inodes_sb() to flush delalloc when committing a transaction (when using -o flushoncommit), in order to avoid deadlocks with filesystem freeze operations. This change was made by commit ce8ea7cc6eb313 ("btrfs: don't call btrfs_start_delalloc_roots in flushoncommit"). After that change we started producing that warning, and every now and then a user reports this since the warning happens too often, it spams dmesg/syslog, and a user is unsure if this reflects any problem that might compromise the filesystem's reliability. We can not just lock the sb->s_umount semaphore before calling writeback_inodes_sb(), because that would at least deadlock with filesystem freezing, since at fs/super.c:freeze_super() sync_filesystem() is called while we are holding that semaphore in write mode, and that can trigger a transaction commit, resulting in a deadlock. It would also trigger the same type of deadlock in the unmount path. Possibly, it could also introduce some other locking dependencies that lockdep would report. To fix this call try_to_writeback_inodes_sb() instead of writeback_inodes_sb(), because that will try to read lock sb->s_umount and then will only call writeback_inodes_sb() if it was able to lock it. This is fine because the cases where it can't read lock sb->s_umount are during a filesystem unmount or during a filesystem freeze - in those cases sb->s_umount is write locked and sync_filesystem() is called, which calls writeback_inodes_sb(). In other words, in all cases where we can't take a read lock on sb->s_umount, writeback is already being triggered elsewhere. An alternative would be to call btrfs_start_delalloc_roots() with a number of pages different from LONG_MAX, for example matching the number of delalloc bytes we currently have, in which case we would end up starting all delalloc with filemap_fdatawrite_wbc() and not with an async flush via filemap_flush() - that is only possible after the rather recent commit e076ab2a2ca70a ("btrfs: shrink delalloc pages instead of full inodes"). However that creates a whole new can of worms due to new lock dependencies, which lockdep complains, like for example: [ 8948.247280] ====================================================== [ 8948.247823] WARNING: possible circular locking dependency detected [ 8948.248353] 5.17.0-rc1-btrfs-next-111 #1 Not tainted [ 8948.248786] ------------------------------------------------------ [ 8948.249320] kworker/u16:18/933570 is trying to acquire lock: [ 8948.249812] ffff9b3de1591690 (sb_internal#2){.+.+}-{0:0}, at: find_free_extent+0x141e/0x1590 [btrfs] [ 8948.250638] but task is already holding lock: [ 8948.251140] ffff9b3e09c717d8 (&root->delalloc_mutex){+.+.}-{3:3}, at: start_delalloc_inodes+0x78/0x400 [btrfs] [ 8948.252018] which lock already depends on the new lock. [ 8948.252710] the existing dependency chain (in reverse order) is: [ 8948.253343] -> #2 (&root->delalloc_mutex){+.+.}-{3:3}: [ 8948.253950] __mutex_lock+0x90/0x900 [ 8948.254354] start_delalloc_inodes+0x78/0x400 [btrfs] [ 8948.254859] btrfs_start_delalloc_roots+0x194/0x2a0 [btrfs] [ 8948.255408] btrfs_commit_transaction+0x32f/0xc00 [btrfs] [ 8948.255942] btrfs_mksubvol+0x380/0x570 [btrfs] [ 8948.256406] btrfs_mksnapshot+0x81/0xb0 [btrfs] [ 8948.256870] __btrfs_ioctl_snap_create+0x17f/0x190 [btrfs] [ 8948.257413] btrfs_ioctl_snap_create_v2+0xbb/0x140 [btrfs] [ 8948.257961] btrfs_ioctl+0x1196/0x3630 [btrfs] [ 8948.258418] __x64_sys_ioctl+0x83/0xb0 [ 8948.258793] do_syscall_64+0x3b/0xc0 [ 8948.259146] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 8948.259709] -> #1 (&fs_info->delalloc_root_mutex){+.+.}-{3:3}: [ 8948.260330] __mutex_lock+0x90/0x900 [ 8948.260692] btrfs_start_delalloc_roots+0x97/0x2a0 [btrfs] [ 8948.261234] btrfs_commit_transaction+0x32f/0xc00 [btrfs] [ 8948.261766] btrfs_set_free_space_cache_v1_active+0x38/0x60 [btrfs] [ 8948.262379] btrfs_start_pre_rw_mount+0x119/0x180 [btrfs] [ 8948.262909] open_ctree+0x1511/0x171e [btrfs] [ 8948.263359] btrfs_mount_root.cold+0x12/0xde [btrfs] [ 8948.263863] legacy_get_tree+0x30/0x50 [ 8948.264242] vfs_get_tree+0x28/0xc0 [ 8948.264594] vfs_kern_mount.part.0+0x71/0xb0 [ 8948.265017] btrfs_mount+0x11d/0x3a0 [btrfs] [ 8948.265462] legacy_get_tree+0x30/0x50 [ 8948.265851] vfs_get_tree+0x28/0xc0 [ 8948.266203] path_mount+0x2d4/0xbe0 [ 8948.266554] __x64_sys_mount+0x103/0x140 [ 8948.266940] do_syscall_64+0x3b/0xc0 [ 8948.267300] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 8948.267790] -> #0 (sb_internal#2){.+.+}-{0:0}: [ 8948.268322] __lock_acquire+0x12e8/0x2260 [ 8948.268733] lock_acquire+0xd7/0x310 [ 8948.269092] start_transaction+0x44c/0x6e0 [btrfs] [ 8948.269591] find_free_extent+0x141e/0x1590 [btrfs] [ 8948.270087] btrfs_reserve_extent+0x14b/0x280 [btrfs] [ 8948.270588] cow_file_range+0x17e/0x490 [btrfs] [ 8948.271051] btrfs_run_delalloc_range+0x345/0x7a0 [btrfs] [ 8948.271586] writepage_delalloc+0xb5/0x170 [btrfs] [ 8948.272071] __extent_writepage+0x156/0x3c0 [btrfs] [ 8948.272579] extent_write_cache_pages+0x263/0x460 [btrfs] [ 8948.273113] extent_writepages+0x76/0x130 [btrfs] [ 8948.273573] do_writepages+0xd2/0x1c0 [ 8948.273942] filemap_fdatawrite_wbc+0x68/0x90 [ 8948.274371] start_delalloc_inodes+0x17f/0x400 [btrfs] [ 8948.274876] btrfs_start_delalloc_roots+0x194/0x2a0 [btrfs] [ 8948.275417] flush_space+0x1f2/0x630 [btrfs] [ 8948.275863] btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs] [ 8948.276438] process_one_work+0x252/0x5a0 [ 8948.276829] worker_thread+0x55/0x3b0 [ 8948.277189] kthread+0xf2/0x120 [ 8948.277506] ret_from_fork+0x22/0x30 [ 8948.277868] other info that might help us debug this: [ 8948.278548] Chain exists of: sb_internal#2 --> &fs_info->delalloc_root_mutex --> &root->delalloc_mutex [ 8948.279601] Possible unsafe locking scenario: [ 8948.280102] CPU0 CPU1 [ 8948.280508] ---- ---- [ 8948.280915] lock(&root->delalloc_mutex); [ 8948.281271] lock(&fs_info->delalloc_root_mutex); [ 8948.281915] lock(&root->delalloc_mutex); [ 8948.282487] lock(sb_internal#2); [ 8948.282800] *** DEADLOCK *** [ 8948.283333] 4 locks held by kworker/u16:18/933570: [ 8948.283750] #0: ffff9b3dc00a9d48 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x1d2/0x5a0 [ 8948.284609] #1: ffffa90349dafe70 ((work_completion)(&fs_info->async_data_reclaim_work)){+.+.}-{0:0}, at: process_one_work+0x1d2/0x5a0 [ 8948.285637] #2: ffff9b3e14db5040 (&fs_info->delalloc_root_mutex){+.+.}-{3:3}, at: btrfs_start_delalloc_roots+0x97/0x2a0 [btrfs] [ 8948.286674] #3: ffff9b3e09c717d8 (&root->delalloc_mutex){+.+.}-{3:3}, at: start_delalloc_inodes+0x78/0x400 [btrfs] [ 8948.287596] stack backtrace: [ 8948.287975] CPU: 3 PID: 933570 Comm: kworker/u16:18 Not tainted 5.17.0-rc1-btrfs-next-111 #1 [ 8948.288677] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 8948.289649] Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs] [ 8948.290298] Call Trace: [ 8948.290517] [ 8948.290700] dump_stack_lvl+0x59/0x73 [ 8948.291026] check_noncircular+0xf3/0x110 [ 8948.291375] ? start_transaction+0x228/0x6e0 [btrfs] [ 8948.291826] __lock_acquire+0x12e8/0x2260 [ 8948.292241] lock_acquire+0xd7/0x310 [ 8948.292714] ? find_free_extent+0x141e/0x1590 [btrfs] [ 8948.293241] ? lock_is_held_type+0xea/0x140 [ 8948.293601] start_transaction+0x44c/0x6e0 [btrfs] [ 8948.294055] ? find_free_extent+0x141e/0x1590 [btrfs] [ 8948.294518] find_free_extent+0x141e/0x1590 [btrfs] [ 8948.294957] ? _raw_spin_unlock+0x29/0x40 [ 8948.295312] ? btrfs_get_alloc_profile+0x124/0x290 [btrfs] [ 8948.295813] btrfs_reserve_extent+0x14b/0x280 [btrfs] [ 8948.296270] cow_file_range+0x17e/0x490 [btrfs] [ 8948.296691] btrfs_run_delalloc_range+0x345/0x7a0 [btrfs] [ 8948.297175] ? find_lock_delalloc_range+0x247/0x270 [btrfs] [ 8948.297678] writepage_delalloc+0xb5/0x170 [btrfs] [ 8948.298123] __extent_writepage+0x156/0x3c0 [btrfs] [ 8948.298570] extent_write_cache_pages+0x263/0x460 [btrfs] [ 8948.299061] extent_writepages+0x76/0x130 [btrfs] [ 8948.299495] do_writepages+0xd2/0x1c0 [ 8948.299817] ? sched_clock_cpu+0xd/0x110 [ 8948.300160] ? lock_release+0x155/0x4a0 [ 8948.300494] filemap_fdatawrite_wbc+0x68/0x90 [ 8948.300874] ? do_raw_spin_unlock+0x4b/0xa0 [ 8948.301243] start_delalloc_inodes+0x17f/0x400 [btrfs] [ 8948.301706] ? lock_release+0x155/0x4a0 [ 8948.302055] btrfs_start_delalloc_roots+0x194/0x2a0 [btrfs] [ 8948.302564] flush_space+0x1f2/0x630 [btrfs] [ 8948.302970] btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs] [ 8948.303510] process_one_work+0x252/0x5a0 [ 8948.303860] ? process_one_work+0x5a0/0x5a0 [ 8948.304221] worker_thread+0x55/0x3b0 [ 8948.304543] ? process_one_work+0x5a0/0x5a0 [ 8948.304904] kthread+0xf2/0x120 [ 8948.305184] ? kthread_complete_and_exit+0x20/0x20 [ 8948.305598] ret_from_fork+0x22/0x30 [ 8948.305921] It all comes from the fact that btrfs_start_delalloc_roots() takes the delalloc_root_mutex, in the transaction commit path we are holding a read lock on one of the superblock's freeze semaphores (via sb_start_intwrite()), the async reclaim task can also do a call to btrfs_start_delalloc_roots(), which ends up triggering writeback with calls to filemap_fdatawrite_wbc(), resulting in extent allocation which in turn can call btrfs_start_transaction(), which will result in taking the freeze semaphore via sb_start_intwrite(), forming a nasty dependency on all those locks which can be taken in different orders by different code paths. So just adopt the simple approach of calling try_to_writeback_inodes_sb() at btrfs_start_delalloc_flush(). Link: https://lore.kernel.org/linux-btrfs/20220130005258.GA7465@cuci.nl/ Link: https://lore.kernel.org/linux-btrfs/43acc426-d683-d1b6-729d-c6bc4a2fff4d@gmail.com/ Link: https://lore.kernel.org/linux-btrfs/6833930a-08d7-6fbc-0141-eb9cdfd6bb4d@gmail.com/ Link: https://lore.kernel.org/linux-btrfs/20190322041731.GF16651@hungrycats.org/ Reviewed-by: Omar Sandoval Signed-off-by: Filipe Manana [ add more link reports ] Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/transaction.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f1ae5a5b79c6..e3e9c58ea66f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2014,16 +2014,24 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { /* - * We use writeback_inodes_sb here because if we used + * We use try_to_writeback_inodes_sb() here because if we used * btrfs_start_delalloc_roots we would deadlock with fs freeze. * Currently are holding the fs freeze lock, if we do an async flush * we'll do btrfs_join_transaction() and deadlock because we need to * wait for the fs freeze lock. Using the direct flushing we benefit * from already being in a transaction and our join_transaction doesn't * have to re-take the fs freeze lock. + * + * Note that try_to_writeback_inodes_sb() will only trigger writeback + * if it can read lock sb->s_umount. It will always be able to lock it, + * except when the filesystem is being unmounted or being frozen, but in + * those cases sync_filesystem() is called, which results in calling + * writeback_inodes_sb() while holding a write lock on sb->s_umount. + * Note that we don't call writeback_inodes_sb() directly, because it + * will emit a warning if sb->s_umount is not locked. */ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) - writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); + try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); return 0; } -- cgit v1.2.3 From ccf46cb68859b46ab2c1337ecf062203e7318ca2 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Sat, 12 Feb 2022 08:16:20 +1000 Subject: cifs: do not use uninitialized data in the owner/group sid [ Upstream commit 26d3dadebbcbddfaf1d9caad42527a28a0ed28d8 ] When idsfromsid is used we create a special SID for owner/group. This structure must be initialized or else the first 5 bytes of the Authority field of the SID will contain uninitialized data and thus not be a valid SID. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/cifsacl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index ee3aab3dd4ac..5df21d63dd04 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1297,7 +1297,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, if (uid_valid(uid)) { /* chown */ uid_t id; - nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid), + nowner_sid_ptr = kzalloc(sizeof(struct cifs_sid), GFP_KERNEL); if (!nowner_sid_ptr) { rc = -ENOMEM; @@ -1326,7 +1326,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, } if (gid_valid(gid)) { /* chgrp */ gid_t id; - ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid), + ngroup_sid_ptr = kzalloc(sizeof(struct cifs_sid), GFP_KERNEL); if (!ngroup_sid_ptr) { rc = -ENOMEM; -- cgit v1.2.3 From 546d60859ecf13380fcabcbeace53a5971493a2b Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 11 Feb 2022 02:59:15 +1000 Subject: cifs: fix double free race when mount fails in cifs_get_root() [ Upstream commit 3d6cc9898efdfb062efb74dc18cfc700e082f5d5 ] When cifs_get_root() fails during cifs_smb3_do_mount() we call deactivate_locked_super() which eventually will call delayed_free() which will free the context. In this situation we should not proceed to enter the out: section in cifs_smb3_do_mount() and free the same resources a second time. [Thu Feb 10 12:59:06 2022] BUG: KASAN: use-after-free in rcu_cblist_dequeue+0x32/0x60 [Thu Feb 10 12:59:06 2022] Read of size 8 at addr ffff888364f4d110 by task swapper/1/0 [Thu Feb 10 12:59:06 2022] CPU: 1 PID: 0 Comm: swapper/1 Tainted: G OE 5.17.0-rc3+ #4 [Thu Feb 10 12:59:06 2022] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.0 12/17/2019 [Thu Feb 10 12:59:06 2022] Call Trace: [Thu Feb 10 12:59:06 2022] [Thu Feb 10 12:59:06 2022] dump_stack_lvl+0x5d/0x78 [Thu Feb 10 12:59:06 2022] print_address_description.constprop.0+0x24/0x150 [Thu Feb 10 12:59:06 2022] ? rcu_cblist_dequeue+0x32/0x60 [Thu Feb 10 12:59:06 2022] kasan_report.cold+0x7d/0x117 [Thu Feb 10 12:59:06 2022] ? rcu_cblist_dequeue+0x32/0x60 [Thu Feb 10 12:59:06 2022] __asan_load8+0x86/0xa0 [Thu Feb 10 12:59:06 2022] rcu_cblist_dequeue+0x32/0x60 [Thu Feb 10 12:59:06 2022] rcu_core+0x547/0xca0 [Thu Feb 10 12:59:06 2022] ? call_rcu+0x3c0/0x3c0 [Thu Feb 10 12:59:06 2022] ? __this_cpu_preempt_check+0x13/0x20 [Thu Feb 10 12:59:06 2022] ? lock_is_held_type+0xea/0x140 [Thu Feb 10 12:59:06 2022] rcu_core_si+0xe/0x10 [Thu Feb 10 12:59:06 2022] __do_softirq+0x1d4/0x67b [Thu Feb 10 12:59:06 2022] __irq_exit_rcu+0x100/0x150 [Thu Feb 10 12:59:06 2022] irq_exit_rcu+0xe/0x30 [Thu Feb 10 12:59:06 2022] sysvec_hyperv_stimer0+0x9d/0xc0 ... [Thu Feb 10 12:59:07 2022] Freed by task 58179: [Thu Feb 10 12:59:07 2022] kasan_save_stack+0x26/0x50 [Thu Feb 10 12:59:07 2022] kasan_set_track+0x25/0x30 [Thu Feb 10 12:59:07 2022] kasan_set_free_info+0x24/0x40 [Thu Feb 10 12:59:07 2022] ____kasan_slab_free+0x137/0x170 [Thu Feb 10 12:59:07 2022] __kasan_slab_free+0x12/0x20 [Thu Feb 10 12:59:07 2022] slab_free_freelist_hook+0xb3/0x1d0 [Thu Feb 10 12:59:07 2022] kfree+0xcd/0x520 [Thu Feb 10 12:59:07 2022] cifs_smb3_do_mount+0x149/0xbe0 [cifs] [Thu Feb 10 12:59:07 2022] smb3_get_tree+0x1a0/0x2e0 [cifs] [Thu Feb 10 12:59:07 2022] vfs_get_tree+0x52/0x140 [Thu Feb 10 12:59:07 2022] path_mount+0x635/0x10c0 [Thu Feb 10 12:59:07 2022] __x64_sys_mount+0x1bf/0x210 [Thu Feb 10 12:59:07 2022] do_syscall_64+0x5c/0xc0 [Thu Feb 10 12:59:07 2022] entry_SYSCALL_64_after_hwframe+0x44/0xae [Thu Feb 10 12:59:07 2022] Last potentially related work creation: [Thu Feb 10 12:59:07 2022] kasan_save_stack+0x26/0x50 [Thu Feb 10 12:59:07 2022] __kasan_record_aux_stack+0xb6/0xc0 [Thu Feb 10 12:59:07 2022] kasan_record_aux_stack_noalloc+0xb/0x10 [Thu Feb 10 12:59:07 2022] call_rcu+0x76/0x3c0 [Thu Feb 10 12:59:07 2022] cifs_umount+0xce/0xe0 [cifs] [Thu Feb 10 12:59:07 2022] cifs_kill_sb+0xc8/0xe0 [cifs] [Thu Feb 10 12:59:07 2022] deactivate_locked_super+0x5d/0xd0 [Thu Feb 10 12:59:07 2022] cifs_smb3_do_mount+0xab9/0xbe0 [cifs] [Thu Feb 10 12:59:07 2022] smb3_get_tree+0x1a0/0x2e0 [cifs] [Thu Feb 10 12:59:07 2022] vfs_get_tree+0x52/0x140 [Thu Feb 10 12:59:07 2022] path_mount+0x635/0x10c0 [Thu Feb 10 12:59:07 2022] __x64_sys_mount+0x1bf/0x210 [Thu Feb 10 12:59:07 2022] do_syscall_64+0x5c/0xc0 [Thu Feb 10 12:59:07 2022] entry_SYSCALL_64_after_hwframe+0x44/0xae Reported-by: Shyam Prasad N Reviewed-by: Shyam Prasad N Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/cifsfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 9fa930dfd78d..21bf82fc2278 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -909,6 +909,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, out_super: deactivate_locked_super(sb); + return root; out: if (cifs_sb) { kfree(cifs_sb->prepath); -- cgit v1.2.3 From 54e7951a1988ea710eabe7a1ea5f11010906f0e1 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Mon, 14 Feb 2022 08:40:52 +1000 Subject: cifs: modefromsids must add an ACE for authenticated users [ Upstream commit 0c6f4ebf8835d01866eb686d47578cde80097981 ] When we create a file with modefromsids we set an ACL that has one ACE for the magic modefromsid as well as a second ACE that grants full access to all authenticated users. When later we chante the mode on the file we strip away this, and other, ACE for authenticated users in set_chmod_dacl() and then just add back/update the modefromsid ACE. Thus leaving the file with a single ACE that is for the mode and no ACE to grant any user any rights to access the file. Fix this by always adding back also the modefromsid ACE so that we do not drop the rights to access the file. Signed-off-by: Ronnie Sahlberg Reviewed-by: Shyam Prasad N Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/cifsacl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 5df21d63dd04..bf861fef2f0c 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -949,6 +949,9 @@ static void populate_new_aces(char *nacl_base, pnntace = (struct cifs_ace *) (nacl_base + nsize); nsize += setup_special_mode_ACE(pnntace, nmode); num_aces++; + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += setup_authusers_ACE(pnntace); + num_aces++; goto set_size; } @@ -1613,7 +1616,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, nsecdesclen = secdesclen; if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */ if (mode_from_sid) - nsecdesclen += sizeof(struct cifs_ace); + nsecdesclen += 2 * sizeof(struct cifs_ace); else /* cifsacl */ nsecdesclen += 5 * sizeof(struct cifs_ace); } else { /* chown */ -- cgit v1.2.3 From 1ffc130388c5065d491ca58fb41bc248fd63a2f5 Mon Sep 17 00:00:00 2001 From: Christophe Vu-Brugier Date: Tue, 2 Nov 2021 22:23:58 +0100 Subject: exfat: reuse exfat_inode_info variable instead of calling EXFAT_I() [ Upstream commit 7dee6f57d7f22a89dd214518c778aec448270d4c ] Also add a local "struct exfat_inode_info *ei" variable to exfat_truncate() to simplify the code. Signed-off-by: Christophe Vu-Brugier Signed-off-by: Namjae Jeon Signed-off-by: Sasha Levin --- fs/exfat/file.c | 14 +++++++------- fs/exfat/inode.c | 9 ++++----- fs/exfat/namei.c | 6 +++--- fs/exfat/super.c | 6 +++--- 4 files changed, 17 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 6af0191b648f..848166d6d5e9 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -110,8 +110,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) exfat_set_volume_dirty(sb); num_clusters_new = EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi); - num_clusters_phys = - EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk, sbi); + num_clusters_phys = EXFAT_B_TO_CLU_ROUND_UP(ei->i_size_ondisk, sbi); exfat_chain_set(&clu, ei->start_clu, num_clusters_phys, ei->flags); @@ -228,12 +227,13 @@ void exfat_truncate(struct inode *inode, loff_t size) { struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); unsigned int blocksize = i_blocksize(inode); loff_t aligned_size; int err; mutex_lock(&sbi->s_lock); - if (EXFAT_I(inode)->start_clu == 0) { + if (ei->start_clu == 0) { /* * Empty start_clu != ~0 (not allocated) */ @@ -260,11 +260,11 @@ write_size: aligned_size++; } - if (EXFAT_I(inode)->i_size_ondisk > i_size_read(inode)) - EXFAT_I(inode)->i_size_ondisk = aligned_size; + if (ei->i_size_ondisk > i_size_read(inode)) + ei->i_size_ondisk = aligned_size; - if (EXFAT_I(inode)->i_size_aligned > i_size_read(inode)) - EXFAT_I(inode)->i_size_aligned = aligned_size; + if (ei->i_size_aligned > i_size_read(inode)) + ei->i_size_aligned = aligned_size; mutex_unlock(&sbi->s_lock); } diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 1c7aa1ea4724..aca2e64d045b 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -114,10 +114,9 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset, unsigned int local_clu_offset = clu_offset; unsigned int num_to_be_allocated = 0, num_clusters = 0; - if (EXFAT_I(inode)->i_size_ondisk > 0) + if (ei->i_size_ondisk > 0) num_clusters = - EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk, - sbi); + EXFAT_B_TO_CLU_ROUND_UP(ei->i_size_ondisk, sbi); if (clu_offset >= num_clusters) num_to_be_allocated = clu_offset - num_clusters + 1; @@ -416,10 +415,10 @@ static int exfat_write_end(struct file *file, struct address_space *mapping, err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); - if (EXFAT_I(inode)->i_size_aligned < i_size_read(inode)) { + if (ei->i_size_aligned < i_size_read(inode)) { exfat_fs_error(inode->i_sb, "invalid size(size(%llu) > aligned(%llu)\n", - i_size_read(inode), EXFAT_I(inode)->i_size_aligned); + i_size_read(inode), ei->i_size_aligned); return -EIO; } diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 24b41103d1cc..9d8ada781250 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -395,9 +395,9 @@ static int exfat_find_empty_entry(struct inode *inode, /* directory inode should be updated in here */ i_size_write(inode, size); - EXFAT_I(inode)->i_size_ondisk += sbi->cluster_size; - EXFAT_I(inode)->i_size_aligned += sbi->cluster_size; - EXFAT_I(inode)->flags = p_dir->flags; + ei->i_size_ondisk += sbi->cluster_size; + ei->i_size_aligned += sbi->cluster_size; + ei->flags = p_dir->flags; inode->i_blocks += 1 << sbi->sect_per_clus_bits; } diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 5539ffc20d16..1a2115d73a48 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -366,9 +366,9 @@ static int exfat_read_root(struct inode *inode) inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) & ~(sbi->cluster_size - 1)) >> inode->i_blkbits; - EXFAT_I(inode)->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff; - EXFAT_I(inode)->i_size_aligned = i_size_read(inode); - EXFAT_I(inode)->i_size_ondisk = i_size_read(inode); + ei->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff; + ei->i_size_aligned = i_size_read(inode); + ei->i_size_ondisk = i_size_read(inode); exfat_save_attr(inode, ATTR_SUBDIR); inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = -- cgit v1.2.3 From c9f727219f3e64b63e03e00dc0cf3d685c17f730 Mon Sep 17 00:00:00 2001 From: Christophe Vu-Brugier Date: Mon, 22 Nov 2021 22:02:37 +0900 Subject: exfat: fix i_blocks for files truncated over 4 GiB [ Upstream commit 92fba084b79e6bc7b12fc118209f1922c1a2df56 ] In exfat_truncate(), the computation of inode->i_blocks is wrong if the file is larger than 4 GiB because a 32-bit variable is used as a mask. This is fixed and simplified by using round_up(). Also fix the same buggy computation in exfat_read_root() and another (correct) one in exfat_fill_inode(). The latter was fixed another way last month but can be simplified by using round_up() as well. See: commit 0c336d6e33f4 ("exfat: fix incorrect loading of i_blocks for large files") Fixes: 98d917047e8b ("exfat: add file operations") Cc: stable@vger.kernel.org # v5.7+ Suggested-by: Matthew Wilcox Reviewed-by: Sungjong Seo Signed-off-by: Christophe Vu-Brugier Signed-off-by: Namjae Jeon Signed-off-by: Sasha Levin --- fs/exfat/file.c | 4 ++-- fs/exfat/inode.c | 4 ++-- fs/exfat/super.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 848166d6d5e9..d890fd34bb2d 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -251,8 +251,8 @@ void exfat_truncate(struct inode *inode, loff_t size) else mark_inode_dirty(inode); - inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) & - ~(sbi->cluster_size - 1)) >> inode->i_blkbits; + inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> + inode->i_blkbits; write_size: aligned_size = i_size_read(inode); if (aligned_size & (blocksize - 1)) { diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index aca2e64d045b..72a0ccfb616c 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -602,8 +602,8 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) exfat_save_attr(inode, info->attr); - inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) & - ~((loff_t)sbi->cluster_size - 1)) >> inode->i_blkbits; + inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> + inode->i_blkbits; inode->i_mtime = info->mtime; inode->i_ctime = info->mtime; ei->i_crtime = info->crtime; diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 1a2115d73a48..4b5d02b1df58 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -364,8 +364,8 @@ static int exfat_read_root(struct inode *inode) inode->i_op = &exfat_dir_inode_operations; inode->i_fop = &exfat_dir_operations; - inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) - & ~(sbi->cluster_size - 1)) >> inode->i_blkbits; + inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> + inode->i_blkbits; ei->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff; ei->i_size_aligned = i_size_read(inode); ei->i_size_ondisk = i_size_read(inode); -- cgit v1.2.3 From 5abb1d84b6dba66513e7b451945dad999eba253a Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 23 Dec 2021 12:21:38 -0800 Subject: ext4: drop ineligible txn start stop APIs [ Upstream commit 7bbbe241ec7ce0def9f71464c878fdbd2b0dcf37 ] This patch drops ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() APIs. Fast commit ineligible transactions should simply call ext4_fc_mark_ineligible() after starting the trasaction. Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20211223202140.2061101-3-harshads@google.com Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin --- fs/ext4/ext4.h | 6 ++-- fs/ext4/extents.c | 6 ++-- fs/ext4/fast_commit.c | 79 ++++++++++----------------------------------------- fs/ext4/ioctl.c | 3 +- fs/ext4/super.c | 1 - 5 files changed, 20 insertions(+), 75 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7ebc816ae39f..58829af68fed 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1724,9 +1724,9 @@ struct ext4_sb_info { */ struct work_struct s_error_work; - /* Ext4 fast commit stuff */ + /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; - atomic_t s_fc_ineligible_updates; + /* * After commit starts, the main queue gets locked, and the further * updates get added in the staging queue. @@ -2925,8 +2925,6 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason); -void ext4_fc_start_ineligible(struct super_block *sb, int reason); -void ext4_fc_stop_ineligible(struct super_block *sb); void ext4_fc_start_update(struct inode *inode); void ext4_fc_stop_update(struct inode *inode); void ext4_fc_del(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c35cb6d9b7b5..c19813402ce9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5340,7 +5340,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); @@ -5379,7 +5379,6 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); - ext4_fc_stop_ineligible(sb); out_mmap: filemap_invalidate_unlock(mapping); out_mutex: @@ -5481,7 +5480,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; @@ -5556,7 +5555,6 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); - ext4_fc_stop_ineligible(sb); out_mmap: filemap_invalidate_unlock(mapping); out_mutex: diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 7bcd3be07ee4..43da096d9b85 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -65,21 +65,11 @@ * * Fast Commit Ineligibility * ------------------------- - * Not all operations are supported by fast commits today (e.g extended - * attributes). Fast commit ineligibility is marked by calling one of the - * two following functions: - * - * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall - * back to full commit. This is useful in case of transient errors. * - * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all - * the fast commits happening between ext4_fc_start_ineligible() and - * ext4_fc_stop_ineligible() and one fast commit after the call to - * ext4_fc_stop_ineligible() to fall back to full commits. It is important to - * make one more fast commit to fall back to full commit after stop call so - * that it guaranteed that the fast commit ineligible operation contained - * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is - * followed by at least 1 full commit. + * Not all operations are supported by fast commits today (e.g extended + * attributes). Fast commit ineligibility is marked by calling + * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back + * to full commit. * * Atomicity of commits * -------------------- @@ -328,44 +318,6 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason) sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } -/* - * Start a fast commit ineligible update. Any commits that happen while - * such an operation is in progress fall back to full commits. - */ -void ext4_fc_start_ineligible(struct super_block *sb, int reason) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) - return; - - WARN_ON(reason >= EXT4_FC_REASON_MAX); - sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; - atomic_inc(&sbi->s_fc_ineligible_updates); -} - -/* - * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here - * to ensure that after stopping the ineligible update, at least one full - * commit takes place. - */ -void ext4_fc_stop_ineligible(struct super_block *sb) -{ - if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) - return; - - ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates); -} - -static inline int ext4_fc_is_ineligible(struct super_block *sb) -{ - return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) || - atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates)); -} - /* * Generic fast commit tracking function. If this is the first time this we are * called after a full commit, we initialize fast commit fields and then call @@ -391,7 +343,7 @@ static int ext4_fc_track_template( (sbi->s_mount_state & EXT4_FC_REPLAY)) return -EOPNOTSUPP; - if (ext4_fc_is_ineligible(inode->i_sb)) + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return -EINVAL; tid = handle->h_transaction->t_tid; @@ -1140,11 +1092,8 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) start_time = ktime_get(); - if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || - (ext4_fc_is_ineligible(sb))) { - reason = EXT4_FC_REASON_INELIGIBLE; - goto out; - } + if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) + return jbd2_complete_transaction(journal, commit_tid); restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); @@ -1160,6 +1109,14 @@ restart_fc: reason = EXT4_FC_REASON_FC_START_FAILED; goto out; } + /* + * After establishing journal barrier via jbd2_fc_begin_commit(), check + * if we are fast commit ineligible. + */ + if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { + reason = EXT4_FC_REASON_INELIGIBLE; + goto out; + } fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); @@ -1178,12 +1135,6 @@ restart_fc: atomic_inc(&sbi->s_fc_subtid); jbd2_fc_end_commit(journal); out: - /* Has any ineligible update happened since we started? */ - if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) { - sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; - reason = EXT4_FC_REASON_INELIGIBLE; - } - spin_lock(&sbi->s_fc_lock); if (reason != EXT4_FC_REASON_OK && reason != EXT4_FC_REASON_ALREADY_COMMITTED) { diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 220a4c8178b5..fd70bebb1437 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -169,7 +169,7 @@ static long swap_inode_boot_loader(struct super_block *sb, err = -EINVAL; goto err_out; } - ext4_fc_start_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT); /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl); @@ -252,7 +252,6 @@ revert: err_out1: ext4_journal_stop(handle); - ext4_fc_stop_ineligible(sb); ext4_double_up_write_data_sem(inode, inode_bl); err_out: diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 877c5c17e61f..538a33c3b117 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4613,7 +4613,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Initialize fast commit stuff */ atomic_set(&sbi->s_fc_subtid, 0); - atomic_set(&sbi->s_fc_ineligible_updates, 0); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); -- cgit v1.2.3 From 647b3f1533f40191c85b04b4184a89cbe6b00981 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 23 Dec 2021 12:21:39 -0800 Subject: ext4: simplify updating of fast commit stats [ Upstream commit 0915e464cb274648e1ef1663e1356e53ff400983 ] Move fast commit stats updating logic to a separate function from ext4_fc_commit(). This significantly improves readability of ext4_fc_commit(). Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20211223202140.2061101-4-harshads@google.com Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin --- fs/ext4/ext4.h | 1 - fs/ext4/fast_commit.c | 99 ++++++++++++++++++++++++++++----------------------- fs/ext4/fast_commit.h | 27 +++++++------- 3 files changed, 68 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 58829af68fed..7448e24cd545 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1746,7 +1746,6 @@ struct ext4_sb_info { spinlock_t s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; - u64 s_fc_avg_commit_time; #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 43da096d9b85..f8915dff82ee 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1073,6 +1073,32 @@ out: return ret; } +static void ext4_fc_update_stats(struct super_block *sb, int status, + u64 commit_time, int nblks) +{ + struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; + + jbd_debug(1, "Fast commit ended with status = %d", status); + if (status == EXT4_FC_STATUS_OK) { + stats->fc_num_commits++; + stats->fc_numblks += nblks; + if (likely(stats->s_fc_avg_commit_time)) + stats->s_fc_avg_commit_time = + (commit_time + + stats->s_fc_avg_commit_time * 3) / 4; + else + stats->s_fc_avg_commit_time = commit_time; + } else if (status == EXT4_FC_STATUS_FAILED || + status == EXT4_FC_STATUS_INELIGIBLE) { + if (status == EXT4_FC_STATUS_FAILED) + stats->fc_failed_commits++; + stats->fc_ineligible_commits++; + } else { + stats->fc_skipped_commits++; + } + trace_ext4_fc_commit_stop(sb, nblks, status); +} + /* * The main commit entry point. Performs a fast commit for transaction * commit_tid if needed. If it's not possible to perform a fast commit @@ -1085,7 +1111,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) struct ext4_sb_info *sbi = EXT4_SB(sb); int nblks = 0, ret, bsize = journal->j_blocksize; int subtid = atomic_read(&sbi->s_fc_subtid); - int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0; + int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; trace_ext4_fc_commit_start(sb); @@ -1102,69 +1128,52 @@ restart_fc: if (atomic_read(&sbi->s_fc_subtid) <= subtid && commit_tid > journal->j_commit_sequence) goto restart_fc; - reason = EXT4_FC_REASON_ALREADY_COMMITTED; - goto out; + ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0); + return 0; } else if (ret) { - sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; - reason = EXT4_FC_REASON_FC_START_FAILED; - goto out; + /* + * Commit couldn't start. Just update stats and perform a + * full commit. + */ + ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0); + return jbd2_complete_transaction(journal, commit_tid); } + /* * After establishing journal barrier via jbd2_fc_begin_commit(), check * if we are fast commit ineligible. */ if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { - reason = EXT4_FC_REASON_INELIGIBLE; - goto out; + status = EXT4_FC_STATUS_INELIGIBLE; + goto fallback; } fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { - sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; - reason = EXT4_FC_REASON_FC_FAILED; - goto out; + status = EXT4_FC_STATUS_FAILED; + goto fallback; } nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; ret = jbd2_fc_wait_bufs(journal, nblks); if (ret < 0) { - sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; - reason = EXT4_FC_REASON_FC_FAILED; - goto out; + status = EXT4_FC_STATUS_FAILED; + goto fallback; } atomic_inc(&sbi->s_fc_subtid); - jbd2_fc_end_commit(journal); -out: - spin_lock(&sbi->s_fc_lock); - if (reason != EXT4_FC_REASON_OK && - reason != EXT4_FC_REASON_ALREADY_COMMITTED) { - sbi->s_fc_stats.fc_ineligible_commits++; - } else { - sbi->s_fc_stats.fc_num_commits++; - sbi->s_fc_stats.fc_numblks += nblks; - } - spin_unlock(&sbi->s_fc_lock); - nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0; - trace_ext4_fc_commit_stop(sb, nblks, reason); - commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + ret = jbd2_fc_end_commit(journal); /* - * weight the commit time higher than the average time so we don't - * react too strongly to vast changes in the commit time + * weight the commit time higher than the average time so we + * don't react too strongly to vast changes in the commit time */ - if (likely(sbi->s_fc_avg_commit_time)) - sbi->s_fc_avg_commit_time = (commit_time + - sbi->s_fc_avg_commit_time * 3) / 4; - else - sbi->s_fc_avg_commit_time = commit_time; - jbd_debug(1, - "Fast commit ended with blks = %d, reason = %d, subtid - %d", - nblks, reason, subtid); - if (reason == EXT4_FC_REASON_FC_FAILED) - return jbd2_fc_end_commit_fallback(journal); - if (reason == EXT4_FC_REASON_FC_START_FAILED || - reason == EXT4_FC_REASON_INELIGIBLE) - return jbd2_complete_transaction(journal, commit_tid); - return 0; + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + ext4_fc_update_stats(sb, status, commit_time, nblks); + return ret; + +fallback: + ret = jbd2_fc_end_commit_fallback(journal); + ext4_fc_update_stats(sb, status, 0, 0); + return ret; } /* @@ -2125,7 +2134,7 @@ int ext4_fc_info_show(struct seq_file *seq, void *v) "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", stats->fc_num_commits, stats->fc_ineligible_commits, stats->fc_numblks, - div_u64(sbi->s_fc_avg_commit_time, 1000)); + div_u64(stats->s_fc_avg_commit_time, 1000)); seq_puts(seq, "Ineligible reasons:\n"); for (i = 0; i < EXT4_FC_REASON_MAX; i++) seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 937c381b4c85..083ad1cb705a 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -71,21 +71,19 @@ struct ext4_fc_tail { }; /* - * Fast commit reason codes + * Fast commit status codes + */ +enum { + EXT4_FC_STATUS_OK = 0, + EXT4_FC_STATUS_INELIGIBLE, + EXT4_FC_STATUS_SKIPPED, + EXT4_FC_STATUS_FAILED, +}; + +/* + * Fast commit ineligiblity reasons: */ enum { - /* - * Commit status codes: - */ - EXT4_FC_REASON_OK = 0, - EXT4_FC_REASON_INELIGIBLE, - EXT4_FC_REASON_ALREADY_COMMITTED, - EXT4_FC_REASON_FC_START_FAILED, - EXT4_FC_REASON_FC_FAILED, - - /* - * Fast commit ineligiblity reasons: - */ EXT4_FC_REASON_XATTR = 0, EXT4_FC_REASON_CROSS_RENAME, EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, @@ -117,7 +115,10 @@ struct ext4_fc_stats { unsigned int fc_ineligible_reason_count[EXT4_FC_REASON_MAX]; unsigned long fc_num_commits; unsigned long fc_ineligible_commits; + unsigned long fc_failed_commits; + unsigned long fc_skipped_commits; unsigned long fc_numblks; + u64 s_fc_avg_commit_time; }; #define EXT4_FC_REPLAY_REALLOC_INCREMENT 4 -- cgit v1.2.3 From 97abcfedc87cb501071e8947184ee0c5acca6874 Mon Sep 17 00:00:00 2001 From: Xin Yin Date: Mon, 17 Jan 2022 17:36:54 +0800 Subject: ext4: fast commit may not fallback for ineligible commit [ Upstream commit e85c81ba8859a4c839bcd69c5d83b32954133a5b ] For the follow scenario: 1. jbd start commit transaction n 2. task A get new handle for transaction n+1 3. task A do some ineligible actions and mark FC_INELIGIBLE 4. jbd complete transaction n and clean FC_INELIGIBLE 5. task A call fsync In this case fast commit will not fallback to full commit and transaction n+1 also not handled by jbd. Make ext4_fc_mark_ineligible() also record transaction tid for latest ineligible case, when call ext4_fc_cleanup() check current transaction tid, if small than latest ineligible tid do not clear the EXT4_MF_FC_INELIGIBLE. Reported-by: kernel test robot Reported-by: Dan Carpenter Reported-by: Ritesh Harjani Suggested-by: Harshad Shirwadkar Signed-off-by: Xin Yin Link: https://lore.kernel.org/r/20220117093655.35160-2-yinxin.x@bytedance.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Sasha Levin --- fs/ext4/ext4.h | 3 ++- fs/ext4/extents.c | 4 ++-- fs/ext4/fast_commit.c | 33 +++++++++++++++++++++++++-------- fs/ext4/inode.c | 4 ++-- fs/ext4/ioctl.c | 4 ++-- fs/ext4/namei.c | 4 ++-- fs/ext4/super.c | 1 + fs/ext4/xattr.c | 6 +++--- fs/jbd2/commit.c | 2 +- fs/jbd2/journal.c | 2 +- include/linux/jbd2.h | 2 +- 11 files changed, 42 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7448e24cd545..2b67c7d1677b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1746,6 +1746,7 @@ struct ext4_sb_info { spinlock_t s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; + tid_t s_fc_ineligible_tid; #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif @@ -2923,7 +2924,7 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); -void ext4_fc_mark_ineligible(struct super_block *sb, int reason); +void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); void ext4_fc_start_update(struct inode *inode); void ext4_fc_stop_update(struct inode *inode); void ext4_fc_del(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c19813402ce9..b81c008e6675 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5340,7 +5340,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); @@ -5480,7 +5480,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index f8915dff82ee..96fac6ac18f4 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -302,18 +302,32 @@ restart: } /* - * Mark file system as fast commit ineligible. This means that next commit - * operation would result in a full jbd2 commit. + * Mark file system as fast commit ineligible, and record latest + * ineligible transaction tid. This means until the recorded + * transaction, commit operation would result in a full jbd2 commit. */ -void ext4_fc_mark_ineligible(struct super_block *sb, int reason) +void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) { struct ext4_sb_info *sbi = EXT4_SB(sb); + tid_t tid; if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + if (handle && !IS_ERR(handle)) + tid = handle->h_transaction->t_tid; + else { + read_lock(&sbi->s_journal->j_state_lock); + tid = sbi->s_journal->j_running_transaction ? + sbi->s_journal->j_running_transaction->t_tid : 0; + read_unlock(&sbi->s_journal->j_state_lock); + } + spin_lock(&sbi->s_fc_lock); + if (sbi->s_fc_ineligible_tid < tid) + sbi->s_fc_ineligible_tid = tid; + spin_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -389,7 +403,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) mutex_unlock(&ei->i_fc_lock); node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -402,7 +416,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) if (!node->fcd_name.name) { kmem_cache_free(ext4_fc_dentry_cachep, node); ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_NOMEM); + EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -504,7 +518,7 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_INODE_JOURNAL_DATA); + EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } @@ -1180,7 +1194,7 @@ fallback: * Fast commit cleanup routine. This is called after every fast commit and * full commit. full is true if we are called after a full commit. */ -static void ext4_fc_cleanup(journal_t *journal, int full) +static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1228,7 +1242,10 @@ static void ext4_fc_cleanup(journal_t *journal, int full) &sbi->s_fc_q[FC_Q_MAIN]); ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); - ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + if (tid >= sbi->s_fc_ineligible_tid) { + sbi->s_fc_ineligible_tid = 0; + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + } if (full) sbi->s_fc_bytes = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b6746cc86cee..22a5140546fb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -337,7 +337,7 @@ stop_handle: return; no_delete: if (!list_empty(&EXT4_I(inode)->i_fc_list)) - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } @@ -5969,7 +5969,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return PTR_ERR(handle); ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_JOURNAL_FLAG_CHANGE); + EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle); err = ext4_mark_inode_dirty(handle, inode); ext4_handle_sync(handle); ext4_journal_stop(handle); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index fd70bebb1437..f61b59045c6d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -169,7 +169,7 @@ static long swap_inode_boot_loader(struct super_block *sb, err = -EINVAL; goto err_out; } - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle); /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl); @@ -1075,7 +1075,7 @@ mext_out: err = ext4_resize_fs(sb, n_blocks_count); if (EXT4_SB(sb)->s_journal) { - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index da7698341d7d..192ea90e757c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3889,7 +3889,7 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, * dirents in directories. */ ext4_fc_mark_ineligible(old.inode->i_sb, - EXT4_FC_REASON_RENAME_DIR); + EXT4_FC_REASON_RENAME_DIR, handle); } else { if (new.inode) ext4_fc_track_unlink(handle, new.dentry); @@ -4049,7 +4049,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(retval)) goto end_rename; ext4_fc_mark_ineligible(new.inode->i_sb, - EXT4_FC_REASON_CROSS_RENAME); + EXT4_FC_REASON_CROSS_RENAME, handle); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 538a33c3b117..eb5d55cf277c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4620,6 +4620,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); + sbi->s_fc_ineligible_tid = 0; spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1e0fc1ed845b..042325349098 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2408,7 +2408,7 @@ retry_inode: if (IS_SYNC(inode)) ext4_handle_sync(handle); } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); cleanup: brelse(is.iloc.bh); @@ -2486,7 +2486,7 @@ retry: if (error == 0) error = error2; } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, NULL); return error; } @@ -2920,7 +2920,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, error); goto cleanup; } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); } error = 0; cleanup: diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 3cc4ab2ba7f4..d188fa913a07 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -1170,7 +1170,7 @@ restart_loop: if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); if (journal->j_fc_cleanup_callback) - journal->j_fc_cleanup_callback(journal, 1); + journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid); trace_jbd2_end_commit(journal, commit_transaction); jbd_debug(1, "JBD2: commit %d complete, head %d\n", diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index bd9ac9891604..1f8493ef181d 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -769,7 +769,7 @@ EXPORT_SYMBOL(jbd2_fc_begin_commit); static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) { if (journal->j_fc_cleanup_callback) - journal->j_fc_cleanup_callback(journal, 0); + journal->j_fc_cleanup_callback(journal, 0, tid); write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; if (fallback) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index fd933c45281a..d63b8106796e 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1295,7 +1295,7 @@ struct journal_s * Clean-up after fast commit or full commit. JBD2 calls this function * after every commit operation. */ - void (*j_fc_cleanup_callback)(struct journal_s *journal, int); + void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid); /** * @j_fc_replay_callback: -- cgit v1.2.3 From 6f6ffc717b852b6aad4dd3618b0ee16f7be0829a Mon Sep 17 00:00:00 2001 From: Xin Yin Date: Mon, 17 Jan 2022 17:36:55 +0800 Subject: ext4: fast commit may miss file actions [ Upstream commit bdc8a53a6f2f0b1cb5f991440f2100732299eb93 ] in the follow scenario: 1. jbd start transaction n 2. task A get new handle for transaction n+1 3. task A do some actions and add inode to FC_Q_MAIN fc_q 4. jbd complete transaction n and clear FC_Q_MAIN fc_q 5. task A call fsync Fast commit will lost the file actions during a full commit. we should also add updates to staging queue during a full commit. and in ext4_fc_cleanup(), when reset a inode's fc track range, check it's i_sync_tid, if it bigger than current transaction tid, do not rest it, or we will lost the track range. And EXT4_MF_FC_COMMITTING is not needed anymore, so drop it. Signed-off-by: Xin Yin Link: https://lore.kernel.org/r/20220117093655.35160-3-yinxin.x@bytedance.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Sasha Levin --- fs/ext4/ext4.h | 5 +---- fs/ext4/fast_commit.c | 11 ++++++----- fs/ext4/super.c | 1 - 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2b67c7d1677b..db981619f6c8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1792,10 +1792,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) enum { EXT4_MF_MNTDIR_SAMPLED, EXT4_MF_FS_ABORTED, /* Fatal error detected */ - EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ - EXT4_MF_FC_COMMITTING /* File system underoing a fast - * commit. - */ + EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ }; static inline void ext4_set_mount_flag(struct super_block *sb, int bit) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 96fac6ac18f4..285c91b0166c 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -377,7 +377,8 @@ static int ext4_fc_track_template( spin_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, - (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ? + (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || + sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); spin_unlock(&sbi->s_fc_lock); @@ -430,7 +431,8 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) node->fcd_name.len = dentry->d_name.len; spin_lock(&sbi->s_fc_lock); - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) + if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || + sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_STAGING]); else @@ -894,7 +896,6 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) int ret = 0; spin_lock(&sbi->s_fc_lock); - ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING); list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); while (atomic_read(&ei->i_fc_updates)) { @@ -1212,7 +1213,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) list_del_init(&iter->i_fc_list); ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); - ext4_fc_reset_inode(&iter->vfs_inode); + if (iter->i_sync_tid <= tid) + ext4_fc_reset_inode(&iter->vfs_inode); /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ smp_mb(); #if (BITS_PER_LONG < 64) @@ -1241,7 +1243,6 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], &sbi->s_fc_q[FC_Q_MAIN]); - ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); if (tid >= sbi->s_fc_ineligible_tid) { sbi->s_fc_ineligible_tid = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index eb5d55cf277c..fd4d34deb9fc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4619,7 +4619,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); sbi->s_fc_ineligible_tid = 0; spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); -- cgit v1.2.3 From 2de88544b3dbe9f4be3220750e34c97dd86386c1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 30 Sep 2021 17:06:21 -0400 Subject: NFSD: Have legacy NFSD WRITE decoders use xdr_stream_subsegment() [ Upstream commit dae9a6cab8009e526570e7477ce858dcdfeb256e ] Refactor. Now that the NFSv2 and NFSv3 XDR decoders have been converted to use xdr_streams, the WRITE decoder functions can use xdr_stream_subsegment() to extract the WRITE payload into its own xdr_buf, just as the NFSv4 WRITE XDR decoder currently does. That makes it possible to pass the first kvec, pages array + length, page_base, and total payload length via a single function parameter. The payload's page_base is not yet assigned or used, but will be in subsequent patches. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields Signed-off-by: Sasha Levin --- fs/nfsd/nfs3proc.c | 3 +-- fs/nfsd/nfs3xdr.c | 12 ++---------- fs/nfsd/nfs4proc.c | 3 +-- fs/nfsd/nfsproc.c | 3 +-- fs/nfsd/nfsxdr.c | 9 +-------- fs/nfsd/xdr.h | 2 +- fs/nfsd/xdr3.h | 2 +- include/linux/sunrpc/svc.h | 3 +-- net/sunrpc/svc.c | 11 ++++++----- 9 files changed, 15 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9918d6ad23ec..354bbfcd96aa 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -210,8 +210,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; - nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages, - &argp->first, cnt); + nvecs = svc_fill_write_vector(rqstp, &argp->payload); if (!nvecs) { resp->status = nfserr_io; goto out; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 7a900131d20c..0ee156b9c9d7 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -621,9 +621,6 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_writeargs *args = rqstp->rq_argp; u32 max_blocksize = svc_max_payload(rqstp); - struct kvec *head = rqstp->rq_arg.head; - struct kvec *tail = rqstp->rq_arg.tail; - size_t remaining; if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) return 0; @@ -641,17 +638,12 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) /* request sanity */ if (args->count != args->len) return 0; - remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len; - remaining -= xdr_stream_pos(xdr); - if (remaining < xdr_align_size(args->len)) - return 0; if (args->count > max_blocksize) { args->count = max_blocksize; args->len = max_blocksize; } - - args->first.iov_base = xdr->p; - args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); + if (!xdr_stream_subsegment(xdr, &args->payload, args->count)) + return 0; return 1; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 4b9a3b90a41f..65200910107f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1038,8 +1038,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, write->wr_how_written = write->wr_stable_how; - nvecs = svc_fill_write_vector(rqstp, write->wr_payload.pages, - write->wr_payload.head, write->wr_buflen); + nvecs = svc_fill_write_vector(rqstp, &write->wr_payload); WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf, diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 19c568b8a527..de282f3273c5 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -234,8 +234,7 @@ nfsd_proc_write(struct svc_rqst *rqstp) SVCFH_fmt(&argp->fh), argp->len, argp->offset); - nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages, - &argp->first, cnt); + nvecs = svc_fill_write_vector(rqstp, &argp->payload); if (!nvecs) { resp->status = nfserr_io; goto out; diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index a06c05fe3b42..26a42f87c240 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -325,10 +325,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_writeargs *args = rqstp->rq_argp; - struct kvec *head = rqstp->rq_arg.head; - struct kvec *tail = rqstp->rq_arg.tail; u32 beginoffset, totalcount; - size_t remaining; if (!svcxdr_decode_fhandle(xdr, &args->fh)) return 0; @@ -346,12 +343,8 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) return 0; if (args->len > NFSSVC_MAXBLKSIZE_V2) return 0; - remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len; - remaining -= xdr_stream_pos(xdr); - if (remaining < xdr_align_size(args->len)) + if (!xdr_stream_subsegment(xdr, &args->payload, args->len)) return 0; - args->first.iov_base = xdr->p; - args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); return 1; } diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index f45b4bc93f52..80fd6d7f3404 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -33,7 +33,7 @@ struct nfsd_writeargs { svc_fh fh; __u32 offset; int len; - struct kvec first; + struct xdr_buf payload; }; struct nfsd_createargs { diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 933008382bbe..712c117300cb 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -40,7 +40,7 @@ struct nfsd3_writeargs { __u32 count; int stable; __u32 len; - struct kvec first; + struct xdr_buf payload; }; struct nfsd3_createargs { diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 064c96157d1f..6263410c948a 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -532,8 +532,7 @@ int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, - struct page **pages, - struct kvec *first, size_t total); + struct xdr_buf *payload); char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first, void *p, size_t total); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index a3bbe5ce4570..08ca797bb8a4 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1676,16 +1676,17 @@ EXPORT_SYMBOL_GPL(svc_encode_result_payload); /** * svc_fill_write_vector - Construct data argument for VFS write call * @rqstp: svc_rqst to operate on - * @pages: list of pages containing data payload - * @first: buffer containing first section of write payload - * @total: total number of bytes of write payload + * @payload: xdr_buf containing only the write data payload * * Fills in rqstp::rq_vec, and returns the number of elements. */ -unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct page **pages, - struct kvec *first, size_t total) +unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, + struct xdr_buf *payload) { + struct page **pages = payload->pages; + struct kvec *first = payload->head; struct kvec *vec = rqstp->rq_vec; + size_t total = payload->len; unsigned int i; /* Some types of transport can present the write payload -- cgit v1.2.3 From 384d1b11382b4d8065a1c3ce930e88c5193f9857 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 21 Dec 2021 11:52:06 -0500 Subject: NFSD: Fix zero-length NFSv3 WRITEs [ Upstream commit 6a2f774424bfdcc2df3e17de0cefe74a4269cad5 ] The Linux NFS server currently responds to a zero-length NFSv3 WRITE request with NFS3ERR_IO. It responds to a zero-length NFSv4 WRITE with NFS4_OK and count of zero. RFC 1813 says of the WRITE procedure's @count argument: count The number of bytes of data to be written. If count is 0, the WRITE will succeed and return a count of 0, barring errors due to permissions checking. RFC 8881 has similar language for NFSv4, though NFSv4 removed the explicit @count argument because that value is already contained in the opaque payload array. The synthetic client pynfs's WRT4 and WRT15 tests do emit zero- length WRITEs to exercise this spec requirement. Commit fdec6114ee1f ("nfsd4: zero-length WRITE should succeed") addressed the same problem there with the same fix. But interestingly the Linux NFS client does not appear to emit zero- length WRITEs, instead squelching them. I'm not aware of a test that can generate such WRITEs for NFSv3, so I wrote a naive C program to generate a zero-length WRITE and test this fix. Fixes: 8154ef2776aa ("NFSD: Clean up legacy NFS WRITE argument XDR decoders") Reported-by: Trond Myklebust Signed-off-by: Chuck Lever Cc: stable@vger.kernel.org Signed-off-by: Chuck Lever Signed-off-by: Sasha Levin --- fs/nfsd/nfs3proc.c | 6 +----- fs/nfsd/nfsproc.c | 5 ----- 2 files changed, 1 insertion(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 354bbfcd96aa..b540489ea240 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -211,15 +211,11 @@ nfsd3_proc_write(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; nvecs = svc_fill_write_vector(rqstp, &argp->payload); - if (!nvecs) { - resp->status = nfserr_io; - goto out; - } + resp->status = nfsd_write(rqstp, &resp->fh, argp->offset, rqstp->rq_vec, nvecs, &cnt, resp->committed, resp->verf); resp->count = cnt; -out: return rpc_success; } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index de282f3273c5..312fd289be58 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -235,10 +235,6 @@ nfsd_proc_write(struct svc_rqst *rqstp) argp->len, argp->offset); nvecs = svc_fill_write_vector(rqstp, &argp->payload); - if (!nvecs) { - resp->status = nfserr_io; - goto out; - } resp->status = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, rqstp->rq_vec, nvecs, @@ -247,7 +243,6 @@ nfsd_proc_write(struct svc_rqst *rqstp) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) return rpc_drop_reply; -out: return rpc_success; } -- cgit v1.2.3 From 1bd12b7aaee0c1c818101de4a46bb56b8fdc8ccb Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Thu, 25 Nov 2021 17:21:02 +0800 Subject: io_uring: fix no lock protection for ctx->cq_extra [ Upstream commit e302f1046f4c209291b07ff7bc4d15ca26891f16 ] ctx->cq_extra should be protected by completion lock so that the req_need_defer() does the right check. Cc: stable@vger.kernel.org Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20211125092103.224502-2-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/io_uring.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index d7e49e87b49b..156c54ebb62b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6573,11 +6573,14 @@ static bool io_drain_req(struct io_kiocb *req) } /* Still need defer if there is pending req in defer list. */ + spin_lock(&ctx->completion_lock); if (likely(list_empty_careful(&ctx->defer_list) && !(req->flags & REQ_F_IO_DRAIN))) { + spin_unlock(&ctx->completion_lock); ctx->drain_active = false; return false; } + spin_unlock(&ctx->completion_lock); seq = io_get_sequence(req); /* Still a chance to pass the sequence check */ -- cgit v1.2.3 From 3abe2a70f5b4fb5922f8efbf7ba94ba9948d2abe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 28 Dec 2021 12:35:43 -0500 Subject: NFSD: Fix verifier returned in stable WRITEs [ Upstream commit f11ad7aa653130b71e2e89bed207f387718216d5 ] RFC 8881 explains the purpose of the write verifier this way: > The final portion of the result is the field writeverf. This field > is the write verifier and is a cookie that the client can use to > determine whether a server has changed instance state (e.g., server > restart) between a call to WRITE and a subsequent call to either > WRITE or COMMIT. But then it says: > This cookie MUST be unchanged during a single instance of the > NFSv4.1 server and MUST be unique between instances of the NFSv4.1 > server. If the cookie changes, then the client MUST assume that > any data written with an UNSTABLE4 value for committed and an old > writeverf in the reply has been lost and will need to be > recovered. RFC 1813 has similar language for NFSv3. NFSv2 does not have a write verifier since it doesn't implement the COMMIT procedure. Since commit 19e0663ff9bc ("nfsd: Ensure sampling of the write verifier is atomic with the write"), the Linux NFS server has returned a boot-time-based verifier for UNSTABLE WRITEs, but a zero verifier for FILE_SYNC and DATA_SYNC WRITEs. FILE_SYNC and DATA_SYNC WRITEs are not followed up with a COMMIT, so there's no need for clients to compare verifiers for stable writes. However, by returning a different verifier for stable and unstable writes, the above commit puts the Linux NFS server a step farther out of compliance with the first MUST above. At least one NFS client (FreeBSD) noticed the difference, making this a potential regression. Reported-by: Rick Macklem Link: https://lore.kernel.org/linux-nfs/YQXPR0101MB096857EEACF04A6DF1FC6D9BDD749@YQXPR0101MB0968.CANPRD01.PROD.OUTLOOK.COM/T/ Fixes: 19e0663ff9bc ("nfsd: Ensure sampling of the write verifier is atomic with the write") Signed-off-by: Chuck Lever Signed-off-by: Sasha Levin --- fs/nfsd/vfs.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 78df03843412..271f7c15d6e5 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -993,6 +993,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt); if (flags & RWF_SYNC) { down_write(&nf->nf_rwsem); + if (verf) + nfsd_copy_boot_verifier(verf, + net_generic(SVC_NET(rqstp), + nfsd_net_id)); host_err = vfs_iter_write(file, &iter, &pos, flags); if (host_err < 0) nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), -- cgit v1.2.3 From 0f84cfb465af7f613fcb284f7a2268be5f71c866 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 24 Dec 2021 14:22:28 -0500 Subject: Revert "nfsd: skip some unnecessary stats in the v4 case" [ Upstream commit 58f258f65267542959487dbe8b5641754411843d ] On the wire, I observed NFSv4 OPEN(CREATE) operations sometimes returning a reasonable-looking value in the cinfo.before field and zero in the cinfo.after field. RFC 8881 Section 10.8.1 says: > When a client is making changes to a given directory, it needs to > determine whether there have been changes made to the directory by > other clients. It does this by using the change attribute as > reported before and after the directory operation in the associated > change_info4 value returned for the operation. and > ... The post-operation change > value needs to be saved as the basis for future change_info4 > comparisons. A good quality client implementation therefore saves the zero cinfo.after value. During a subsequent OPEN operation, it will receive a different non-zero value in the cinfo.before field for that directory, and it will incorrectly believe the directory has changed, triggering an undesirable directory cache invalidation. There are filesystem types where fs_supports_change_attribute() returns false, tmpfs being one. On NFSv4 mounts, this means the fh_getattr() call site in fill_pre_wcc() and fill_post_wcc() is never invoked. Subsequently, nfsd4_change_attribute() is invoked with an uninitialized @stat argument. In fill_pre_wcc(), @stat contains stale stack garbage, which is then placed on the wire. In fill_post_wcc(), ->fh_post_wc is all zeroes, so zero is placed on the wire. Both of these values are meaningless. This fix can be applied immediately to stable kernels. Once there are more regression tests in this area, this optimization can be attempted again. Fixes: 428a23d2bf0c ("nfsd: skip some unnecessary stats in the v4 case") Signed-off-by: Chuck Lever Signed-off-by: Sasha Levin --- fs/nfsd/nfs3xdr.c | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 0ee156b9c9d7..48d4f99b7f90 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -487,11 +487,6 @@ neither: return true; } -static bool fs_supports_change_attribute(struct super_block *sb) -{ - return sb->s_flags & SB_I_VERSION || sb->s_export_op->fetch_iversion; -} - /* * Fill in the pre_op attr for the wcc data */ @@ -500,26 +495,24 @@ void fill_pre_wcc(struct svc_fh *fhp) struct inode *inode; struct kstat stat; bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); + __be32 err; if (fhp->fh_no_wcc || fhp->fh_pre_saved) return; inode = d_inode(fhp->fh_dentry); - if (fs_supports_change_attribute(inode->i_sb) || !v4) { - __be32 err = fh_getattr(fhp, &stat); - - if (err) { - /* Grab the times from inode anyway */ - stat.mtime = inode->i_mtime; - stat.ctime = inode->i_ctime; - stat.size = inode->i_size; - } - fhp->fh_pre_mtime = stat.mtime; - fhp->fh_pre_ctime = stat.ctime; - fhp->fh_pre_size = stat.size; + err = fh_getattr(fhp, &stat); + if (err) { + /* Grab the times from inode anyway */ + stat.mtime = inode->i_mtime; + stat.ctime = inode->i_ctime; + stat.size = inode->i_size; } if (v4) fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); + fhp->fh_pre_mtime = stat.mtime; + fhp->fh_pre_ctime = stat.ctime; + fhp->fh_pre_size = stat.size; fhp->fh_pre_saved = true; } @@ -530,6 +523,7 @@ void fill_post_wcc(struct svc_fh *fhp) { bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); struct inode *inode = d_inode(fhp->fh_dentry); + __be32 err; if (fhp->fh_no_wcc) return; @@ -537,16 +531,12 @@ void fill_post_wcc(struct svc_fh *fhp) if (fhp->fh_post_saved) printk("nfsd: inode locked twice during operation.\n"); - fhp->fh_post_saved = true; - - if (fs_supports_change_attribute(inode->i_sb) || !v4) { - __be32 err = fh_getattr(fhp, &fhp->fh_post_attr); - - if (err) { - fhp->fh_post_saved = false; - fhp->fh_post_attr.ctime = inode->i_ctime; - } - } + err = fh_getattr(fhp, &fhp->fh_post_attr); + if (err) { + fhp->fh_post_saved = false; + fhp->fh_post_attr.ctime = inode->i_ctime; + } else + fhp->fh_post_saved = true; if (v4) fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr, inode); -- cgit v1.2.3 From 4425ca3677a6d0b3ecf3b4fa982e67c8c4ece6d7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 5 Jan 2022 14:15:03 -0500 Subject: nfsd: fix crash on COPY_NOTIFY with special stateid [ Upstream commit 074b07d94e0bb6ddce5690a9b7e2373088e8b33a ] RTM says "If the special ONE stateid is passed to nfs4_preprocess_stateid_op(), it returns status=0 but does not set *cstid. nfsd4_copy_notify() depends on stid being set if status=0, and thus can crash if the client sends the right COPY_NOTIFY RPC." RFC 7862 says "The cna_src_stateid MUST refer to either open or locking states provided earlier by the server. If it is invalid, then the operation MUST fail." The RFC doesn't specify an error, and the choice doesn't matter much as this is clearly illegal client behavior, but bad_stateid seems reasonable. Simplest is just to guarantee that nfs4_preprocess_stateid_op, called with non-NULL cstid, errors out if it can't return a stateid. Reported-by: rtm@csail.mit.edu Fixes: 624322f1adc5 ("NFSD add COPY_NOTIFY operation") Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever Reviewed-by: Olga Kornievskaia Tested-by: Olga Kornievskaia Signed-off-by: Sasha Levin --- fs/nfsd/nfs4state.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 97090ddcfc94..db4a47a280dc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6042,7 +6042,11 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, *nfp = NULL; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { - status = check_special_stateids(net, fhp, stateid, flags); + if (cstid) + status = nfserr_bad_stateid; + else + status = check_special_stateids(net, fhp, stateid, + flags); goto done; } -- cgit v1.2.3 From 3f20cf3cd43f90d379b014070f0b62249fdbdeb9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 14 Jan 2022 14:08:30 -0800 Subject: hugetlbfs: fix off-by-one error in hugetlb_vmdelete_list() [ Upstream commit d6aba4c8e20d4d2bf65d589953f6d891c178f3a3 ] Pass "end - 1" instead of "end" when walking the interval tree in hugetlb_vmdelete_list() to fix an inclusive vs. exclusive bug. The two callers that pass a non-zero "end" treat it as exclusive, whereas the interval tree iterator expects an inclusive "last". E.g. punching a hole in a file that precisely matches the size of a single hugepage, with a vma starting right on the boundary, will result in unmap_hugepage_range() being called twice, with the second call having start==end. The off-by-one error doesn't cause functional problems as __unmap_hugepage_range() turns into a massive nop due to short-circuiting its for-loop on "address < end". But, the mmu_notifier invocations to invalid_range_{start,end}() are passed a bogus zero-sized range, which may be unexpected behavior for secondary MMUs. The bug was exposed by commit ed922739c919 ("KVM: Use interval tree to do fast hva lookup in memslots"), currently queued in the KVM tree for 5.17, which added a WARN to detect ranges with start==end. Link: https://lkml.kernel.org/r/20211228234257.1926057-1-seanjc@google.com Fixes: 1bfad99ab425 ("hugetlbfs: hugetlb_vmtruncate_list() needs to take a range to delete") Signed-off-by: Sean Christopherson Reported-by: syzbot+4e697fe80a31aa7efe21@syzkaller.appspotmail.com Reviewed-by: Mike Kravetz Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/hugetlbfs/inode.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index cdfb1ae78a3f..54c4e0b0dda4 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -409,10 +409,11 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) struct vm_area_struct *vma; /* - * end == 0 indicates that the entire range after - * start should be unmapped. + * end == 0 indicates that the entire range after start should be + * unmapped. Note, end is exclusive, whereas the interval tree takes + * an inclusive "last". */ - vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { + vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) { unsigned long v_offset; unsigned long v_end; -- cgit v1.2.3 From 3d74c2c917e4006a3bd660d2fc7829cb2ef64113 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Mon, 19 Jul 2021 10:54:46 +0000 Subject: cifs: protect session channel fields with chan_lock [ Upstream commit 724244cdb3828522109c88e56a0242537aefabe9 ] Introducing a new spin lock to protect all the channel related fields in a cifs_ses struct. This lock should be taken whenever dealing with the channel fields, and should be held only for very short intervals which will not sleep. Currently, all channel related fields in cifs_ses structure are protected by session_mutex. However, this mutex is held for long periods (sometimes while waiting for a reply from server). This makes the codepath quite tricky to change. Signed-off-by: Shyam Prasad N Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/cifs_debug.c | 2 ++ fs/cifs/cifsglob.h | 5 +++++ fs/cifs/connect.c | 25 ++++++++++++++++++++++--- fs/cifs/misc.c | 1 + fs/cifs/sess.c | 50 ++++++++++++++++++++++++++++++++++++++------------ fs/cifs/transport.c | 3 +++ 6 files changed, 71 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 905a901f7f80..248a8f973cf9 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -414,12 +414,14 @@ skip_rdma: from_kuid(&init_user_ns, ses->linux_uid), from_kuid(&init_user_ns, ses->cred_uid)); + spin_lock(&ses->chan_lock); if (ses->chan_count > 1) { seq_printf(m, "\n\n\tExtra Channels: %zu ", ses->chan_count-1); for (j = 1; j < ses->chan_count; j++) cifs_dump_channel(m, j, &ses->chans[j]); } + spin_unlock(&ses->chan_lock); seq_puts(m, "\n\n\tShares: "); j = 0; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 3e5b8e177cfa..b33835b2943e 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -934,16 +934,21 @@ struct cifs_ses { * iface_lock should be taken when accessing any of these fields */ spinlock_t iface_lock; + /* ========= begin: protected by iface_lock ======== */ struct cifs_server_iface *iface_list; size_t iface_count; unsigned long iface_last_update; /* jiffies */ + /* ========= end: protected by iface_lock ======== */ + spinlock_t chan_lock; + /* ========= begin: protected by chan_lock ======== */ #define CIFS_MAX_CHANNELS 16 struct cifs_chan chans[CIFS_MAX_CHANNELS]; struct cifs_chan *binding_chan; size_t chan_count; size_t chan_max; atomic_t chan_seq; /* round robin state */ + /* ========= end: protected by chan_lock ======== */ }; /* diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 439f02f1886c..70da1d27be3d 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1526,8 +1526,12 @@ static int match_session(struct cifs_ses *ses, struct smb3_fs_context *ctx) * If an existing session is limited to less channels than * requested, it should not be reused */ - if (ses->chan_max < ctx->max_channels) + spin_lock(&ses->chan_lock); + if (ses->chan_max < ctx->max_channels) { + spin_unlock(&ses->chan_lock); return 0; + } + spin_unlock(&ses->chan_lock); switch (ses->sectype) { case Kerberos: @@ -1662,6 +1666,7 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) void cifs_put_smb_ses(struct cifs_ses *ses) { unsigned int rc, xid; + unsigned int chan_count; struct TCP_Server_Info *server = ses->server; cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); @@ -1703,12 +1708,24 @@ void cifs_put_smb_ses(struct cifs_ses *ses) list_del_init(&ses->smb_ses_list); spin_unlock(&cifs_tcp_ses_lock); + spin_lock(&ses->chan_lock); + chan_count = ses->chan_count; + spin_unlock(&ses->chan_lock); + /* close any extra channels */ - if (ses->chan_count > 1) { + if (chan_count > 1) { int i; - for (i = 1; i < ses->chan_count; i++) + for (i = 1; i < chan_count; i++) { + /* + * note: for now, we're okay accessing ses->chans + * without chan_lock. But when chans can go away, we'll + * need to introduce ref counting to make sure that chan + * is not freed from under us. + */ cifs_put_tcp_session(ses->chans[i].server, 0); + ses->chans[i].server = NULL; + } } sesInfoFree(ses); @@ -1959,9 +1976,11 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) mutex_lock(&ses->session_mutex); /* add server as first channel */ + spin_lock(&ses->chan_lock); ses->chans[0].server = server; ses->chan_count = 1; ses->chan_max = ctx->multichannel ? ctx->max_channels:1; + spin_unlock(&ses->chan_lock); rc = cifs_negotiate_protocol(xid, ses); if (!rc) diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index bb1185fff8cc..0a0d0724c429 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -75,6 +75,7 @@ sesInfoAlloc(void) INIT_LIST_HEAD(&ret_buf->tcon_list); mutex_init(&ret_buf->session_mutex); spin_lock_init(&ret_buf->iface_lock); + spin_lock_init(&ret_buf->chan_lock); } return ret_buf; } diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 23e02db7923f..a1e688113645 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -54,41 +54,53 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface) { int i; + spin_lock(&ses->chan_lock); for (i = 0; i < ses->chan_count; i++) { - if (is_server_using_iface(ses->chans[i].server, iface)) + if (is_server_using_iface(ses->chans[i].server, iface)) { + spin_unlock(&ses->chan_lock); return true; + } } + spin_unlock(&ses->chan_lock); return false; } /* returns number of channels added */ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) { - int old_chan_count = ses->chan_count; - int left = ses->chan_max - ses->chan_count; + int old_chan_count, new_chan_count; + int left; int i = 0; int rc = 0; int tries = 0; struct cifs_server_iface *ifaces = NULL; size_t iface_count; + if (ses->server->dialect < SMB30_PROT_ID) { + cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n"); + return 0; + } + + spin_lock(&ses->chan_lock); + + new_chan_count = old_chan_count = ses->chan_count; + left = ses->chan_max - ses->chan_count; + if (left <= 0) { cifs_dbg(FYI, "ses already at max_channels (%zu), nothing to open\n", ses->chan_max); - return 0; - } - - if (ses->server->dialect < SMB30_PROT_ID) { - cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n"); + spin_unlock(&ses->chan_lock); return 0; } if (!(ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { cifs_dbg(VFS, "server %s does not support multichannel\n", ses->server->hostname); ses->chan_max = 1; + spin_unlock(&ses->chan_lock); return 0; } + spin_unlock(&ses->chan_lock); /* * Make a copy of the iface list at the time and use that @@ -142,10 +154,11 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) cifs_dbg(FYI, "successfully opened new channel on iface#%d\n", i); left--; + new_chan_count++; } kfree(ifaces); - return ses->chan_count - old_chan_count; + return new_chan_count - old_chan_count; } /* @@ -157,10 +170,14 @@ cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server) { int i; + spin_lock(&ses->chan_lock); for (i = 0; i < ses->chan_count; i++) { - if (ses->chans[i].server == server) + if (ses->chans[i].server == server) { + spin_unlock(&ses->chan_lock); return &ses->chans[i]; + } } + spin_unlock(&ses->chan_lock); return NULL; } @@ -168,6 +185,7 @@ static int cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, struct cifs_server_iface *iface) { + struct TCP_Server_Info *chan_server; struct cifs_chan *chan; struct smb3_fs_context ctx = {NULL}; static const char unc_fmt[] = "\\%s\\foo"; @@ -240,15 +258,20 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, SMB2_CLIENT_GUID_SIZE); ctx.use_client_guid = true; - mutex_lock(&ses->session_mutex); + chan_server = cifs_get_tcp_session(&ctx); + mutex_lock(&ses->session_mutex); + spin_lock(&ses->chan_lock); chan = ses->binding_chan = &ses->chans[ses->chan_count]; - chan->server = cifs_get_tcp_session(&ctx); + chan->server = chan_server; if (IS_ERR(chan->server)) { rc = PTR_ERR(chan->server); chan->server = NULL; + spin_unlock(&ses->chan_lock); goto out; } + spin_unlock(&ses->chan_lock); + spin_lock(&cifs_tcp_ses_lock); chan->server->is_channel = true; spin_unlock(&cifs_tcp_ses_lock); @@ -283,8 +306,11 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, * ses to the new server. */ + spin_lock(&ses->chan_lock); ses->chan_count++; atomic_set(&ses->chan_seq, 0); + spin_unlock(&ses->chan_lock); + out: ses->binding = false; ses->binding_chan = NULL; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index b7379329b741..61ea3d3f95b4 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -1044,14 +1044,17 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) if (!ses) return NULL; + spin_lock(&ses->chan_lock); if (!ses->binding) { /* round robin */ if (ses->chan_count > 1) { index = (uint)atomic_inc_return(&ses->chan_seq); index %= ses->chan_count; } + spin_unlock(&ses->chan_lock); return ses->chans[index].server; } else { + spin_unlock(&ses->chan_lock); return cifs_ses_server(ses); } } -- cgit v1.2.3 From aa280c04da1b4f9aeb5044d3b85540e07c275a33 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 16 Feb 2022 13:23:53 -0600 Subject: cifs: fix confusing unneeded warning message on smb2.1 and earlier [ Upstream commit 53923e0fe2098f90f339510aeaa0e1413ae99a16 ] When mounting with SMB2.1 or earlier, even with nomultichannel, we log the confusing warning message: "CIFS: VFS: multichannel is not supported on this protocol version, use 3.0 or above" Fix this so that we don't log this unless they really are trying to mount with multichannel. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215608 Reported-by: Kim Scarborough Cc: stable@vger.kernel.org # 5.11+ Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/sess.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index a1e688113645..5500ea783784 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -76,11 +76,6 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) struct cifs_server_iface *ifaces = NULL; size_t iface_count; - if (ses->server->dialect < SMB30_PROT_ID) { - cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n"); - return 0; - } - spin_lock(&ses->chan_lock); new_chan_count = old_chan_count = ses->chan_count; @@ -94,6 +89,12 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) return 0; } + if (ses->server->dialect < SMB30_PROT_ID) { + spin_unlock(&ses->chan_lock); + cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n"); + return 0; + } + if (!(ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { cifs_dbg(VFS, "server %s does not support multichannel\n", ses->server->hostname); ses->chan_max = 1; -- cgit v1.2.3 From 5afd80c393f4e87451f14eefb7f2f24daf434e06 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 28 Oct 2021 16:03:41 +0100 Subject: btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range commit f0bfa76a11e93d0fe2c896fcb566568c5e8b5d3f upstream. When doing a direct IO write against a file range that either has preallocated extents in that range or has regular extents and the file has the NOCOW attribute set, the write fails with -ENOSPC when all of the following conditions are met: 1) There are no data blocks groups with enough free space matching the size of the write; 2) There's not enough unallocated space for allocating a new data block group; 3) The extents in the target file range are not shared, neither through snapshots nor through reflinks. This is wrong because a NOCOW write can be done in such case, and in fact it's possible to do it using a buffered IO write, since when failing to allocate data space, the buffered IO path checks if a NOCOW write is possible. The failure in direct IO write path comes from the fact that early on, at btrfs_dio_iomap_begin(), we try to allocate data space for the write and if it that fails we return the error and stop - we never check if we can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check if we can do a NOCOW write into the range, or a subset of the range, and then release the previously reserved data space. Fix this by doing the data reservation only if needed, when we must COW, at btrfs_get_blocks_direct_write() instead of doing it at btrfs_dio_iomap_begin(). This also simplifies a bit the logic and removes the inneficiency of doing unnecessary data reservations. The following example test script reproduces the problem: $ cat dio-nocow-enospc.sh #!/bin/bash DEV=/dev/sdj MNT=/mnt/sdj # Use a small fixed size (1G) filesystem so that it's quick to fill # it up. # Make sure the mixed block groups feature is not enabled because we # later want to not have more space available for allocating data # extents but still have enough metadata space free for the file writes. mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV mount $DEV $MNT # Create our test file with the NOCOW attribute set. touch $MNT/foobar chattr +C $MNT/foobar # Now fill in all unallocated space with data for our test file. # This will allocate a data block group that will be full and leave # no (or a very small amount of) unallocated space in the device, so # that it will not be possible to allocate a new block group later. echo echo "Creating test file with initial data..." xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar # Now try a direct IO write against file range [0, 10M[. # This should succeed since this is a NOCOW file and an extent for the # range was previously allocated. echo echo "Trying direct IO write over allocated space..." xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar umount $MNT When running the test: $ ./dio-nocow-enospc.sh (...) Creating test file with initial data... wrote 943718400/943718400 bytes at offset 0 900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec) Trying direct IO write over allocated space... pwrite: No space left on device A test case for fstests will follow, testing both this direct IO write scenario as well as the buffered IO write scenario to make it less likely to get future regressions on the buffered IO case. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Anand Jain Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/inode.c | 142 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 78 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e92f0b0afe9e..58053b5f0ce1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -60,8 +60,6 @@ struct btrfs_iget_args { }; struct btrfs_dio_data { - u64 reserve; - loff_t length; ssize_t submitted; struct extent_changeset *data_reserved; }; @@ -7763,6 +7761,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em = *map; + int type; + u64 block_start, orig_start, orig_block_len, ram_bytes; + bool can_nocow = false; + bool space_reserved = false; int ret = 0; /* @@ -7777,9 +7779,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && em->block_start != EXTENT_MAP_HOLE)) { - int type; - u64 block_start, orig_start, orig_block_len, ram_bytes; - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) type = BTRFS_ORDERED_PREALLOC; else @@ -7789,53 +7788,92 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, if (can_nocow_extent(inode, start, &len, &orig_start, &orig_block_len, &ram_bytes, false) == 1 && - btrfs_inc_nocow_writers(fs_info, block_start)) { - struct extent_map *em2; + btrfs_inc_nocow_writers(fs_info, block_start)) + can_nocow = true; + } - em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, - orig_start, block_start, - len, orig_block_len, - ram_bytes, type); + if (can_nocow) { + struct extent_map *em2; + + /* We can NOCOW, so only need to reserve metadata space. */ + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); + if (ret < 0) { + /* Our caller expects us to free the input extent map. */ + free_extent_map(em); + *map = NULL; btrfs_dec_nocow_writers(fs_info, block_start); - if (type == BTRFS_ORDERED_PREALLOC) { - free_extent_map(em); - *map = em = em2; - } + goto out; + } + space_reserved = true; - if (em2 && IS_ERR(em2)) { - ret = PTR_ERR(em2); - goto out; - } - /* - * For inode marked NODATACOW or extent marked PREALLOC, - * use the existing or preallocated extent, so does not - * need to adjust btrfs_space_info's bytes_may_use. - */ - btrfs_free_reserved_data_space_noquota(fs_info, len); - goto skip_cow; + em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); + btrfs_dec_nocow_writers(fs_info, block_start); + if (type == BTRFS_ORDERED_PREALLOC) { + free_extent_map(em); + *map = em = em2; } - } - /* this will cow the extent */ - free_extent_map(em); - *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; + if (IS_ERR(em2)) { + ret = PTR_ERR(em2); + goto out; + } + } else { + const u64 prev_len = len; + + /* Our caller expects us to free the input extent map. */ + free_extent_map(em); + *map = NULL; + + /* We have to COW, so need to reserve metadata and data space. */ + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, len); + if (ret < 0) + goto out; + space_reserved = true; + + em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + *map = em; + len = min(len, em->len - (start - em->start)); + if (len < prev_len) + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, + start + len, prev_len - len, + true); } - len = min(len, em->len - (start - em->start)); + /* + * We have created our ordered extent, so we can now release our reservation + * for an outstanding extent. + */ + btrfs_delalloc_release_extents(BTRFS_I(inode), len); -skip_cow: /* * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. */ if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - - dio_data->reserve -= len; out: + if (ret && space_reserved) { + btrfs_delalloc_release_extents(BTRFS_I(inode), len); + if (can_nocow) { + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); + } else { + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, + start, len, true); + extent_changeset_free(dio_data->data_reserved); + dio_data->data_reserved = NULL; + } + } return ret; } @@ -7877,18 +7915,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (!dio_data) return -ENOMEM; - dio_data->length = length; - if (write) { - dio_data->reserve = round_up(length, fs_info->sectorsize); - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), - &dio_data->data_reserved, - start, dio_data->reserve); - if (ret) { - extent_changeset_free(dio_data->data_reserved); - kfree(dio_data); - return ret; - } - } iomap->private = dio_data; @@ -7981,14 +8007,8 @@ unlock_err: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - if (dio_data) { - btrfs_delalloc_release_space(BTRFS_I(inode), - dio_data->data_reserved, start, - dio_data->reserve, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); - extent_changeset_free(dio_data->data_reserved); - kfree(dio_data); - } + kfree(dio_data); + return ret; } @@ -8018,14 +8038,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, ret = -ENOTBLK; } - if (write) { - if (dio_data->reserve) - btrfs_delalloc_release_space(BTRFS_I(inode), - dio_data->data_reserved, pos, - dio_data->reserve, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); + if (write) extent_changeset_free(dio_data->data_reserved); - } out: kfree(dio_data); iomap->private = NULL; -- cgit v1.2.3 From 5342e9f3dac05b2ba8905cdb78719e2018165926 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 17 Feb 2022 12:12:02 +0000 Subject: btrfs: fix lost prealloc extents beyond eof after full fsync commit d99478874355d3a7b9d86dfb5d7590d5b1754b1f upstream. When doing a full fsync, if we have prealloc extents beyond (or at) eof, and the leaves that contain them were not modified in the current transaction, we end up not logging them. This results in losing those extents when we replay the log after a power failure, since the inode is truncated to the current value of the logged i_size. Just like for the fast fsync path, we need to always log all prealloc extents starting at or beyond i_size. The fast fsync case was fixed in commit 471d557afed155 ("Btrfs: fix loss of prealloc extents past i_size after fsync log replay") but it missed the full fsync path. The problem exists since the very early days, when the log tree was added by commit e02119d5a7b439 ("Btrfs: Add a write ahead tree log to optimize synchronous operations"). Example reproducer: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt # Create our test file with many file extent items, so that they span # several leaves of metadata, even if the node/page size is 64K. Use # direct IO and not fsync/O_SYNC because it's both faster and it avoids # clearing the full sync flag from the inode - we want the fsync below # to trigger the slow full sync code path. $ xfs_io -f -d -c "pwrite -b 4K 0 16M" /mnt/foo # Now add two preallocated extents to our file without extending the # file's size. One right at i_size, and another further beyond, leaving # a gap between the two prealloc extents. $ xfs_io -c "falloc -k 16M 1M" /mnt/foo $ xfs_io -c "falloc -k 20M 1M" /mnt/foo # Make sure everything is durably persisted and the transaction is # committed. This makes all created extents to have a generation lower # than the generation of the transaction used by the next write and # fsync. sync # Now overwrite only the first extent, which will result in modifying # only the first leaf of metadata for our inode. Then fsync it. This # fsync will use the slow code path (inode full sync bit is set) because # it's the first fsync since the inode was created/loaded. $ xfs_io -c "pwrite 0 4K" -c "fsync" /mnt/foo # Extent list before power failure. $ xfs_io -c "fiemap -v" /mnt/foo /mnt/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..7]: 2178048..2178055 8 0x0 1: [8..16383]: 26632..43007 16376 0x0 2: [16384..32767]: 2156544..2172927 16384 0x0 3: [32768..34815]: 2172928..2174975 2048 0x800 4: [34816..40959]: hole 6144 5: [40960..43007]: 2174976..2177023 2048 0x801 # Mount fs again, trigger log replay. $ mount /dev/sdc /mnt # Extent list after power failure and log replay. $ xfs_io -c "fiemap -v" /mnt/foo /mnt/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..7]: 2178048..2178055 8 0x0 1: [8..16383]: 26632..43007 16376 0x0 2: [16384..32767]: 2156544..2172927 16384 0x1 # The prealloc extents at file offsets 16M and 20M are missing. So fix this by calling btrfs_log_prealloc_extents() when we are doing a full fsync, so that we always log all prealloc extents beyond eof. A test case for fstests will follow soon. CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b79da8917cbf..211997074534 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4423,7 +4423,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, /* * Log all prealloc extents beyond the inode's i_size to make sure we do not - * lose them after doing a fast fsync and replaying the log. We scan the + * lose them after doing a full/fast fsync and replaying the log. We scan the * subvolume's root instead of iterating the inode's extent map tree because * otherwise we can log incorrect extent items based on extent map conversion. * That can happen due to the fact that extent maps are merged when they @@ -5208,6 +5208,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx, bool *need_log_inode_item) { + const u64 i_size = i_size_read(&inode->vfs_inode); struct btrfs_root *root = inode->root; int ins_start_slot = 0; int ins_nr = 0; @@ -5228,13 +5229,21 @@ again: if (min_key->type > max_key->type) break; - if (min_key->type == BTRFS_INODE_ITEM_KEY) + if (min_key->type == BTRFS_INODE_ITEM_KEY) { *need_log_inode_item = false; - - if ((min_key->type == BTRFS_INODE_REF_KEY || - min_key->type == BTRFS_INODE_EXTREF_KEY) && - inode->generation == trans->transid && - !recursive_logging) { + } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && + min_key->offset >= i_size) { + /* + * Extents at and beyond eof are logged with + * btrfs_log_prealloc_extents(). + * Only regular files have BTRFS_EXTENT_DATA_KEY keys, + * and no keys greater than that, so bail out. + */ + break; + } else if ((min_key->type == BTRFS_INODE_REF_KEY || + min_key->type == BTRFS_INODE_EXTREF_KEY) && + inode->generation == trans->transid && + !recursive_logging) { u64 other_ino = 0; u64 other_parent = 0; @@ -5265,10 +5274,8 @@ again: btrfs_release_path(path); goto next_key; } - } - - /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ - if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ if (ins_nr == 0) goto next_slot; ret = copy_items(trans, inode, dst_path, path, @@ -5321,9 +5328,21 @@ next_key: break; } } - if (ins_nr) + if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, logged_isize); + if (ret) + return ret; + } + + if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { + /* + * Release the path because otherwise we might attempt to double + * lock the same leaf with btrfs_log_prealloc_extents() below. + */ + btrfs_release_path(path); + ret = btrfs_log_prealloc_extents(trans, inode, dst_path); + } return ret; } -- cgit v1.2.3 From 725a6ac389b182261af174176e561a36b0f39ffc Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 17 Feb 2022 15:14:43 -0800 Subject: btrfs: fix relocation crash due to premature return from btrfs_commit_transaction() commit 5fd76bf31ccfecc06e2e6b29f8c809e934085b99 upstream. We are seeing crashes similar to the following trace: [38.969182] WARNING: CPU: 20 PID: 2105 at fs/btrfs/relocation.c:4070 btrfs_relocate_block_group+0x2dc/0x340 [btrfs] [38.973556] CPU: 20 PID: 2105 Comm: btrfs Not tainted 5.17.0-rc4 #54 [38.974580] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [38.976539] RIP: 0010:btrfs_relocate_block_group+0x2dc/0x340 [btrfs] [38.980336] RSP: 0000:ffffb0dd42e03c20 EFLAGS: 00010206 [38.981218] RAX: ffff96cfc4ede800 RBX: ffff96cfc3ce0000 RCX: 000000000002ca14 [38.982560] RDX: 0000000000000000 RSI: 4cfd109a0bcb5d7f RDI: ffff96cfc3ce0360 [38.983619] RBP: ffff96cfc309c000 R08: 0000000000000000 R09: 0000000000000000 [38.984678] R10: ffff96cec0000001 R11: ffffe84c80000000 R12: ffff96cfc4ede800 [38.985735] R13: 0000000000000000 R14: 0000000000000000 R15: ffff96cfc3ce0360 [38.987146] FS: 00007f11c15218c0(0000) GS:ffff96d6dfb00000(0000) knlGS:0000000000000000 [38.988662] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [38.989398] CR2: 00007ffc922c8e60 CR3: 00000001147a6001 CR4: 0000000000370ee0 [38.990279] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [38.991219] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [38.992528] Call Trace: [38.992854] [38.993148] btrfs_relocate_chunk+0x27/0xe0 [btrfs] [38.993941] btrfs_balance+0x78e/0xea0 [btrfs] [38.994801] ? vsnprintf+0x33c/0x520 [38.995368] ? __kmalloc_track_caller+0x351/0x440 [38.996198] btrfs_ioctl_balance+0x2b9/0x3a0 [btrfs] [38.997084] btrfs_ioctl+0x11b0/0x2da0 [btrfs] [38.997867] ? mod_objcg_state+0xee/0x340 [38.998552] ? seq_release+0x24/0x30 [38.999184] ? proc_nr_files+0x30/0x30 [38.999654] ? call_rcu+0xc8/0x2f0 [39.000228] ? __x64_sys_ioctl+0x84/0xc0 [39.000872] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [39.001973] __x64_sys_ioctl+0x84/0xc0 [39.002566] do_syscall_64+0x3a/0x80 [39.003011] entry_SYSCALL_64_after_hwframe+0x44/0xae [39.003735] RIP: 0033:0x7f11c166959b [39.007324] RSP: 002b:00007fff2543e998 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [39.008521] RAX: ffffffffffffffda RBX: 00007f11c1521698 RCX: 00007f11c166959b [39.009833] RDX: 00007fff2543ea40 RSI: 00000000c4009420 RDI: 0000000000000003 [39.011270] RBP: 0000000000000003 R08: 0000000000000013 R09: 00007f11c16f94e0 [39.012581] R10: 0000000000000000 R11: 0000000000000246 R12: 00007fff25440df3 [39.014046] R13: 0000000000000000 R14: 00007fff2543ea40 R15: 0000000000000001 [39.015040] [39.015418] ---[ end trace 0000000000000000 ]--- [43.131559] ------------[ cut here ]------------ [43.132234] kernel BUG at fs/btrfs/extent-tree.c:2717! [43.133031] invalid opcode: 0000 [#1] PREEMPT SMP PTI [43.133702] CPU: 1 PID: 1839 Comm: btrfs Tainted: G W 5.17.0-rc4 #54 [43.134863] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [43.136426] RIP: 0010:unpin_extent_range+0x37a/0x4f0 [btrfs] [43.139913] RSP: 0000:ffffb0dd4216bc70 EFLAGS: 00010246 [43.140629] RAX: 0000000000000000 RBX: ffff96cfc34490f8 RCX: 0000000000000001 [43.141604] RDX: 0000000080000001 RSI: 0000000051d00000 RDI: 00000000ffffffff [43.142645] RBP: 0000000000000000 R08: 0000000000000000 R09: ffff96cfd07dca50 [43.143669] R10: ffff96cfc46e8a00 R11: fffffffffffec000 R12: 0000000041d00000 [43.144657] R13: ffff96cfc3ce0000 R14: ffffb0dd4216bd08 R15: 0000000000000000 [43.145686] FS: 00007f7657dd68c0(0000) GS:ffff96d6df640000(0000) knlGS:0000000000000000 [43.146808] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [43.147584] CR2: 00007f7fe81bf5b0 CR3: 00000001093ee004 CR4: 0000000000370ee0 [43.148589] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [43.149581] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [43.150559] Call Trace: [43.150904] [43.151253] btrfs_finish_extent_commit+0x88/0x290 [btrfs] [43.152127] btrfs_commit_transaction+0x74f/0xaa0 [btrfs] [43.152932] ? btrfs_attach_transaction_barrier+0x1e/0x50 [btrfs] [43.153786] btrfs_ioctl+0x1edc/0x2da0 [btrfs] [43.154475] ? __check_object_size+0x150/0x170 [43.155170] ? preempt_count_add+0x49/0xa0 [43.155753] ? __x64_sys_ioctl+0x84/0xc0 [43.156437] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [43.157456] __x64_sys_ioctl+0x84/0xc0 [43.157980] do_syscall_64+0x3a/0x80 [43.158543] entry_SYSCALL_64_after_hwframe+0x44/0xae [43.159231] RIP: 0033:0x7f7657f1e59b [43.161819] RSP: 002b:00007ffda5cd1658 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [43.162702] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f7657f1e59b [43.163526] RDX: 0000000000000000 RSI: 0000000000009408 RDI: 0000000000000003 [43.164358] RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000 [43.165208] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [43.166029] R13: 00005621b91c3232 R14: 00005621b91ba580 R15: 00007ffda5cd1800 [43.166863] [43.167125] Modules linked in: btrfs blake2b_generic xor pata_acpi ata_piix libata raid6_pq scsi_mod libcrc32c virtio_net virtio_rng net_failover rng_core failover scsi_common [43.169552] ---[ end trace 0000000000000000 ]--- [43.171226] RIP: 0010:unpin_extent_range+0x37a/0x4f0 [btrfs] [43.174767] RSP: 0000:ffffb0dd4216bc70 EFLAGS: 00010246 [43.175600] RAX: 0000000000000000 RBX: ffff96cfc34490f8 RCX: 0000000000000001 [43.176468] RDX: 0000000080000001 RSI: 0000000051d00000 RDI: 00000000ffffffff [43.177357] RBP: 0000000000000000 R08: 0000000000000000 R09: ffff96cfd07dca50 [43.178271] R10: ffff96cfc46e8a00 R11: fffffffffffec000 R12: 0000000041d00000 [43.179178] R13: ffff96cfc3ce0000 R14: ffffb0dd4216bd08 R15: 0000000000000000 [43.180071] FS: 00007f7657dd68c0(0000) GS:ffff96d6df800000(0000) knlGS:0000000000000000 [43.181073] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [43.181808] CR2: 00007fe09905f010 CR3: 00000001093ee004 CR4: 0000000000370ee0 [43.182706] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [43.183591] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 We first hit the WARN_ON(rc->block_group->pinned > 0) in btrfs_relocate_block_group() and then the BUG_ON(!cache) in unpin_extent_range(). This tells us that we are exiting relocation and removing the block group with bytes still pinned for that block group. This is supposed to be impossible: the last thing relocate_block_group() does is commit the transaction to get rid of pinned extents. Commit d0c2f4fa555e ("btrfs: make concurrent fsyncs wait less when waiting for a transaction commit") introduced an optimization so that commits from fsync don't have to wait for the previous commit to unpin extents. This was only intended to affect fsync, but it inadvertently made it possible for any commit to skip waiting for the previous commit to unpin. This is because if a call to btrfs_commit_transaction() finds that another thread is already committing the transaction, it waits for the other thread to complete the commit and then returns. If that other thread was in fsync, then it completes the commit without completing the previous commit. This makes the following sequence of events possible: Thread 1____________________|Thread 2 (fsync)_____________________|Thread 3 (balance)___________________ btrfs_commit_transaction(N) | | btrfs_run_delayed_refs | | pin extents | | ... | | state = UNBLOCKED |btrfs_sync_file | | btrfs_start_transaction(N + 1) |relocate_block_group | | btrfs_join_transaction(N + 1) | btrfs_commit_transaction(N + 1) | ... | trans->state = COMMIT_START | | | btrfs_commit_transaction(N + 1) | | wait_for_commit(N + 1, COMPLETED) | wait_for_commit(N, SUPER_COMMITTED)| state = SUPER_COMMITTED | ... | btrfs_finish_extent_commit| | unpin_extent_range() | trans->state = COMPLETED | | | return | | ... | |Thread 1 isn't done, so pinned > 0 | |and we WARN | | | |btrfs_remove_block_group unpin_extent_range() | | Thread 3 removed the | | block group, so we BUG| | There are other sequences involving SUPER_COMMITTED transactions that can cause a similar outcome. We could fix this by making relocation explicitly wait for unpinning, but there may be other cases that need it. Josef mentioned ENOSPC flushing and the free space cache inode as other potential victims. Rather than playing whack-a-mole, this fix is conservative and makes all commits not in fsync wait for all previous transactions, which is what the optimization intended. Fixes: d0c2f4fa555e ("btrfs: make concurrent fsyncs wait less when waiting for a transaction commit") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Filipe Manana Signed-off-by: Omar Sandoval Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/transaction.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e3e9c58ea66f..3c5b1f72129a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -846,7 +846,37 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) static noinline void wait_for_commit(struct btrfs_transaction *commit, const enum btrfs_trans_state min_state) { - wait_event(commit->commit_wait, commit->state >= min_state); + struct btrfs_fs_info *fs_info = commit->fs_info; + u64 transid = commit->transid; + bool put = false; + + while (1) { + wait_event(commit->commit_wait, commit->state >= min_state); + if (put) + btrfs_put_transaction(commit); + + if (min_state < TRANS_STATE_COMPLETED) + break; + + /* + * A transaction isn't really completed until all of the + * previous transactions are completed, but with fsync we can + * end up with SUPER_COMMITTED transactions before a COMPLETED + * transaction. Wait for those. + */ + + spin_lock(&fs_info->trans_lock); + commit = list_first_entry_or_null(&fs_info->trans_list, + struct btrfs_transaction, + list); + if (!commit || commit->transid > transid) { + spin_unlock(&fs_info->trans_lock); + break; + } + refcount_inc(&commit->use_count); + put = true; + spin_unlock(&fs_info->trans_lock); + } } int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) -- cgit v1.2.3 From e00077aa439f0e8f416699fa4e9600db6583db70 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Feb 2022 10:17:39 -0500 Subject: btrfs: do not WARN_ON() if we have PageError set commit a50e1fcbc9b85fd4e95b89a75c0884cb032a3e06 upstream. Whenever we do any extent buffer operations we call assert_eb_page_uptodate() to complain loudly if we're operating on an non-uptodate page. Our overnight tests caught this warning earlier this week WARNING: CPU: 1 PID: 553508 at fs/btrfs/extent_io.c:6849 assert_eb_page_uptodate+0x3f/0x50 CPU: 1 PID: 553508 Comm: kworker/u4:13 Tainted: G W 5.17.0-rc3+ #564 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Workqueue: btrfs-cache btrfs_work_helper RIP: 0010:assert_eb_page_uptodate+0x3f/0x50 RSP: 0018:ffffa961440a7c68 EFLAGS: 00010246 RAX: 0017ffffc0002112 RBX: ffffe6e74453f9c0 RCX: 0000000000001000 RDX: ffffe6e74467c887 RSI: ffffe6e74453f9c0 RDI: ffff8d4c5efc2fc0 RBP: 0000000000000d56 R08: ffff8d4d4a224000 R09: 0000000000000000 R10: 00015817fa9d1ef0 R11: 000000000000000c R12: 00000000000007b1 R13: ffff8d4c5efc2fc0 R14: 0000000001500000 R15: 0000000001cb1000 FS: 0000000000000000(0000) GS:ffff8d4dbbd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ff31d3448d8 CR3: 0000000118be8004 CR4: 0000000000370ee0 Call Trace: extent_buffer_test_bit+0x3f/0x70 free_space_test_bit+0xa6/0xc0 load_free_space_tree+0x1f6/0x470 caching_thread+0x454/0x630 ? rcu_read_lock_sched_held+0x12/0x60 ? rcu_read_lock_sched_held+0x12/0x60 ? rcu_read_lock_sched_held+0x12/0x60 ? lock_release+0x1f0/0x2d0 btrfs_work_helper+0xf2/0x3e0 ? lock_release+0x1f0/0x2d0 ? finish_task_switch.isra.0+0xf9/0x3a0 process_one_work+0x26d/0x580 ? process_one_work+0x580/0x580 worker_thread+0x55/0x3b0 ? process_one_work+0x580/0x580 kthread+0xf0/0x120 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x1f/0x30 This was partially fixed by c2e39305299f01 ("btrfs: clear extent buffer uptodate when we fail to write it"), however all that fix did was keep us from finding extent buffers after a failed writeout. It didn't keep us from continuing to use a buffer that we already had found. In this case we're searching the commit root to cache the block group, so we can start committing the transaction and switch the commit root and then start writing. After the switch we can look up an extent buffer that hasn't been written yet and start processing that block group. Then we fail to write that block out and clear Uptodate on the page, and then we start spewing these errors. Normally we're protected by the tree lock to a certain degree here. If we read a block we have that block read locked, and we block the writer from locking the block before we submit it for the write. However this isn't necessarily fool proof because the read could happen before we do the submit_bio and after we locked and unlocked the extent buffer. Also in this particular case we have path->skip_locking set, so that won't save us here. We'll simply get a block that was valid when we read it, but became invalid while we were using it. What we really want is to catch the case where we've "read" a block but it's not marked Uptodate. On read we ClearPageError(), so if we're !Uptodate and !Error we know we didn't do the right thing for reading the page. Fix this by checking !Uptodate && !Error, this way we will not complain if our buffer gets invalidated while we're using it, and we'll maintain the spirit of the check which is to make sure we have a fully in-cache block while we're messing with it. CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Josef Bacik Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent_io.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 250fd3c146ac..50a5cc4fd25b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -6801,14 +6801,24 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, { struct btrfs_fs_info *fs_info = eb->fs_info; + /* + * If we are using the commit root we could potentially clear a page + * Uptodate while we're using the extent buffer that we've previously + * looked up. We don't want to complain in this case, as the page was + * valid before, we just didn't write it out. Instead we want to catch + * the case where we didn't actually read the block properly, which + * would have !PageUptodate && !PageError, as we clear PageError before + * reading. + */ if (fs_info->sectorsize < PAGE_SIZE) { - bool uptodate; + bool uptodate, error; uptodate = btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len); - WARN_ON(!uptodate); + error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len); + WARN_ON(!uptodate && !error); } else { - WARN_ON(!PageUptodate(page)); + WARN_ON(!PageUptodate(page) && !PageError(page)); } } -- cgit v1.2.3 From 34146bbadcdd36e696e63d4e38ba1fd677423a85 Mon Sep 17 00:00:00 2001 From: Sidong Yang Date: Mon, 28 Feb 2022 01:43:40 +0000 Subject: btrfs: qgroup: fix deadlock between rescan worker and remove qgroup commit d4aef1e122d8bbdc15ce3bd0bc813d6b44a7d63a upstream. The commit e804861bd4e6 ("btrfs: fix deadlock between quota disable and qgroup rescan worker") by Kawasaki resolves deadlock between quota disable and qgroup rescan worker. But also there is a deadlock case like it. It's about enabling or disabling quota and creating or removing qgroup. It can be reproduced in simple script below. for i in {1..100} do btrfs quota enable /mnt & btrfs qgroup create 1/0 /mnt & btrfs qgroup destroy 1/0 /mnt & btrfs quota disable /mnt & done Here's why the deadlock happens: 1) The quota rescan task is running. 2) Task A calls btrfs_quota_disable(), locks the qgroup_ioctl_lock mutex, and then calls btrfs_qgroup_wait_for_completion(), to wait for the quota rescan task to complete. 3) Task B calls btrfs_remove_qgroup() and it blocks when trying to lock the qgroup_ioctl_lock mutex, because it's being held by task A. At that point task B is holding a transaction handle for the current transaction. 4) The quota rescan task calls btrfs_commit_transaction(). This results in it waiting for all other tasks to release their handles on the transaction, but task B is blocked on the qgroup_ioctl_lock mutex while holding a handle on the transaction, and that mutex is being held by task A, which is waiting for the quota rescan task to complete, resulting in a deadlock between these 3 tasks. To resolve this issue, the thread disabling quota should unlock qgroup_ioctl_lock before waiting rescan completion. Move btrfs_qgroup_wait_for_completion() after unlock of qgroup_ioctl_lock. Fixes: e804861bd4e6 ("btrfs: fix deadlock between quota disable and qgroup rescan worker") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Filipe Manana Reviewed-by: Shin'ichiro Kawasaki Signed-off-by: Sidong Yang Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/qgroup.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 24fb17d5b28f..2c803108ea94 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1196,6 +1196,14 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) if (!fs_info->quota_root) goto out; + /* + * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to + * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs + * to lock that mutex while holding a transaction handle and the rescan + * worker needs to commit a transaction. + */ + mutex_unlock(&fs_info->qgroup_ioctl_lock); + /* * Request qgroup rescan worker to complete and wait for it. This wait * must be done before transaction start for quota disable since it may @@ -1203,7 +1211,6 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) */ clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); btrfs_qgroup_wait_for_completion(fs_info, false); - mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * 1 For the root item -- cgit v1.2.3 From 4aef4c9005974c063b5f6557da2ffa437dfdb3d3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 28 Feb 2022 16:29:28 +0000 Subject: btrfs: add missing run of delayed items after unlink during log replay commit 4751dc99627e4d1465c5bfa8cb7ab31ed418eff5 upstream. During log replay, whenever we need to check if a name (dentry) exists in a directory we do searches on the subvolume tree for inode references or or directory entries (BTRFS_DIR_INDEX_KEY keys, and BTRFS_DIR_ITEM_KEY keys as well, before kernel 5.17). However when during log replay we unlink a name, through btrfs_unlink_inode(), we may not delete inode references and dir index keys from a subvolume tree and instead just add the deletions to the delayed inode's delayed items, which will only be run when we commit the transaction used for log replay. This means that after an unlink operation during log replay, if we attempt to search for the same name during log replay, we will not see that the name was already deleted, since the deletion is recorded only on the delayed items. We run delayed items after every unlink operation during log replay, except at unlink_old_inode_refs() and at add_inode_ref(). This was due to an overlook, as delayed items should be run after evert unlink, for the reasons stated above. So fix those two cases. Fixes: 0d836392cadd5 ("Btrfs: fix mount failure after fsync due to hard link recreation") Fixes: 1f250e929a9c9 ("Btrfs: fix log replay failure after unlink and link combination") CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 211997074534..8ef65073ce8c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1329,6 +1329,15 @@ again: inode, name, namelen); kfree(name); iput(dir); + /* + * Whenever we need to check if a name exists or not, we + * check the subvolume tree. So after an unlink we must + * run delayed items, so that future checks for a name + * during log replay see that the name does not exists + * anymore. + */ + if (!ret) + ret = btrfs_run_delayed_items(trans); if (ret) goto out; goto again; @@ -1580,6 +1589,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, */ if (!ret && inode->i_nlink == 0) inc_nlink(inode); + /* + * Whenever we need to check if a name exists or + * not, we check the subvolume tree. So after an + * unlink we must run delayed items, so that future + * checks for a name during log replay see that the + * name does not exists anymore. + */ + if (!ret) + ret = btrfs_run_delayed_items(trans); } if (ret < 0) goto out; -- cgit v1.2.3 From 6599d5e8bd758d897fd2ef4dc388ae50278b1f7e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Feb 2022 14:56:10 -0500 Subject: btrfs: do not start relocation until in progress drops are done commit b4be6aefa73c9a6899ef3ba9c5faaa8a66e333ef upstream. We hit a bug with a recovering relocation on mount for one of our file systems in production. I reproduced this locally by injecting errors into snapshot delete with balance running at the same time. This presented as an error while looking up an extent item WARNING: CPU: 5 PID: 1501 at fs/btrfs/extent-tree.c:866 lookup_inline_extent_backref+0x647/0x680 CPU: 5 PID: 1501 Comm: btrfs-balance Not tainted 5.16.0-rc8+ #8 RIP: 0010:lookup_inline_extent_backref+0x647/0x680 RSP: 0018:ffffae0a023ab960 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 000000000000000c RDI: 0000000000000000 RBP: ffff943fd2a39b60 R08: 0000000000000000 R09: 0000000000000001 R10: 0001434088152de0 R11: 0000000000000000 R12: 0000000001d05000 R13: ffff943fd2a39b60 R14: ffff943fdb96f2a0 R15: ffff9442fc923000 FS: 0000000000000000(0000) GS:ffff944e9eb40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1157b1fca8 CR3: 000000010f092000 CR4: 0000000000350ee0 Call Trace: insert_inline_extent_backref+0x46/0xd0 __btrfs_inc_extent_ref.isra.0+0x5f/0x200 ? btrfs_merge_delayed_refs+0x164/0x190 __btrfs_run_delayed_refs+0x561/0xfa0 ? btrfs_search_slot+0x7b4/0xb30 ? btrfs_update_root+0x1a9/0x2c0 btrfs_run_delayed_refs+0x73/0x1f0 ? btrfs_update_root+0x1a9/0x2c0 btrfs_commit_transaction+0x50/0xa50 ? btrfs_update_reloc_root+0x122/0x220 prepare_to_merge+0x29f/0x320 relocate_block_group+0x2b8/0x550 btrfs_relocate_block_group+0x1a6/0x350 btrfs_relocate_chunk+0x27/0xe0 btrfs_balance+0x777/0xe60 balance_kthread+0x35/0x50 ? btrfs_balance+0xe60/0xe60 kthread+0x16b/0x190 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x22/0x30 Normally snapshot deletion and relocation are excluded from running at the same time by the fs_info->cleaner_mutex. However if we had a pending balance waiting to get the ->cleaner_mutex, and a snapshot deletion was running, and then the box crashed, we would come up in a state where we have a half deleted snapshot. Again, in the normal case the snapshot deletion needs to complete before relocation can start, but in this case relocation could very well start before the snapshot deletion completes, as we simply add the root to the dead roots list and wait for the next time the cleaner runs to clean up the snapshot. Fix this by setting a bit on the fs_info if we have any DEAD_ROOT's that had a pending drop_progress key. If they do then we know we were in the middle of the drop operation and set a flag on the fs_info. Then balance can wait until this flag is cleared to start up again. If there are DEAD_ROOT's that don't have a drop_progress set then we're safe to start balance right away as we'll be properly protected by the cleaner_mutex. CC: stable@vger.kernel.org # 5.10+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ctree.h | 10 ++++++++++ fs/btrfs/disk-io.c | 10 ++++++++++ fs/btrfs/extent-tree.c | 10 ++++++++++ fs/btrfs/relocation.c | 13 +++++++++++++ fs/btrfs/root-tree.c | 15 +++++++++++++++ fs/btrfs/transaction.c | 33 ++++++++++++++++++++++++++++++++- fs/btrfs/transaction.h | 1 + 7 files changed, 91 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ae06ad559353..b46409801647 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -593,6 +593,9 @@ enum { /* Indicate whether there are any tree modification log users */ BTRFS_FS_TREE_MOD_LOG_USERS, + /* Indicate we have half completed snapshot deletions pending. */ + BTRFS_FS_UNFINISHED_DROPS, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -1098,8 +1101,15 @@ enum { BTRFS_ROOT_HAS_LOG_TREE, /* Qgroup flushing is in progress */ BTRFS_ROOT_QGROUP_FLUSHING, + /* This root has a drop operation that was started previously. */ + BTRFS_ROOT_UNFINISHED_DROP, }; +static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); +} + /* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2c3e106a0270..2180fcef56ca 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3659,6 +3659,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device set_bit(BTRFS_FS_OPEN, &fs_info->flags); + /* Kick the cleaner thread so it'll start deleting snapshots. */ + if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags)) + wake_up_process(fs_info->cleaner_kthread); + clear_oneshot: btrfs_clear_oneshot_options(fs_info); return 0; @@ -4340,6 +4344,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) */ kthread_park(fs_info->cleaner_kthread); + /* + * If we had UNFINISHED_DROPS we could still be processing them, so + * clear that bit and wake up relocation so it can stop. + */ + btrfs_wake_unfinished_drop(fs_info); + /* wait for the qgroup rescan worker to stop */ btrfs_qgroup_wait_for_completion(fs_info, false); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 87c23c5c0f26..514adc83577f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5541,6 +5541,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) int ret; int level; bool root_dropped = false; + bool unfinished_drop = false; btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); @@ -5583,6 +5584,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) * already dropped. */ set_bit(BTRFS_ROOT_DELETING, &root->state); + unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); @@ -5757,6 +5760,13 @@ out_free: kfree(wc); btrfs_free_path(path); out: + /* + * We were an unfinished drop root, check to see if there are any + * pending, and if not clear and wake up any waiters. + */ + if (!err && unfinished_drop) + btrfs_maybe_wake_unfinished_drop(fs_info); + /* * So if we need to stop dropping the snapshot for whatever reason we * need to make sure to add it back to the dead root list so that we diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d81bee621d37..a050f9748fa7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3967,6 +3967,19 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) int rw = 0; int err = 0; + /* + * This only gets set if we had a half-deleted snapshot on mount. We + * cannot allow relocation to start while we're still trying to clean up + * these pending deletions. + */ + ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE); + if (ret) + return ret; + + /* We may have been woken up by close_ctree, so bail if we're closing. */ + if (btrfs_fs_closing(fs_info)) + return -EINTR; + bg = btrfs_lookup_block_group(fs_info, group_start); if (!bg) return -ENOENT; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index db37a3799649..1fa0e5e2e350 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -280,6 +280,21 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)); if (btrfs_root_refs(&root->root_item) == 0) { + struct btrfs_key drop_key; + + btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress); + /* + * If we have a non-zero drop_progress then we know we + * made it partly through deleting this snapshot, and + * thus we need to make sure we block any balance from + * happening until this snapshot is completely dropped. + */ + if (drop_key.objectid != 0 || drop_key.type != 0 || + drop_key.offset != 0) { + set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); + set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); + } + set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); btrfs_add_dead_root(root); } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3c5b1f72129a..9a6009108ea5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1340,6 +1340,32 @@ again: return 0; } +/* + * If we had a pending drop we need to see if there are any others left in our + * dead roots list, and if not clear our bit and wake any waiters. + */ +void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + /* + * We put the drop in progress roots at the front of the list, so if the + * first entry doesn't have UNFINISHED_DROP set we can wake everybody + * up. + */ + spin_lock(&fs_info->trans_lock); + if (!list_empty(&fs_info->dead_roots)) { + struct btrfs_root *root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, + root_list); + if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) { + spin_unlock(&fs_info->trans_lock); + return; + } + } + spin_unlock(&fs_info->trans_lock); + + btrfs_wake_unfinished_drop(fs_info); +} + /* * dead roots are old snapshots that need to be deleted. This allocates * a dirty root struct and adds it into the list of dead roots that need to @@ -1352,7 +1378,12 @@ void btrfs_add_dead_root(struct btrfs_root *root) spin_lock(&fs_info->trans_lock); if (list_empty(&root->root_list)) { btrfs_grab_root(root); - list_add_tail(&root->root_list, &fs_info->dead_roots); + + /* We want to process the partially complete drops first. */ + if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) + list_add(&root->root_list, &fs_info->dead_roots); + else + list_add_tail(&root->root_list, &fs_info->dead_roots); } spin_unlock(&fs_info->trans_lock); } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index eba07b8119bb..0ded32bbd001 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -217,6 +217,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid); void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); +void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info); int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); -- cgit v1.2.3 From 416e3a0e4276c01c6da8df2e97b7b7e5abc04f38 Mon Sep 17 00:00:00 2001 From: Yun Zhou Date: Fri, 4 Mar 2022 20:29:07 -0800 Subject: proc: fix documentation and description of pagemap commit dd21bfa425c098b95ca86845f8e7d1ec1ddf6e4a upstream. Since bit 57 was exported for uffd-wp write-protected (commit fb8e37f35a2f: "mm/pagemap: export uffd-wp protection information"), fixing it can reduce some unnecessary confusion. Link: https://lkml.kernel.org/r/20220301044538.3042713-1-yun.zhou@windriver.com Fixes: fb8e37f35a2fe1 ("mm/pagemap: export uffd-wp protection information") Signed-off-by: Yun Zhou Reviewed-by: Peter Xu Cc: Jonathan Corbet Cc: Tiberiu A Georgescu Cc: Florian Schmidt Cc: Ivan Teterevkov Cc: SeongJae Park Cc: Yang Shi Cc: David Hildenbrand Cc: Axel Rasmussen Cc: Miaohe Lin Cc: Andrea Arcangeli Cc: Colin Cross Cc: Alistair Popple Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- Documentation/admin-guide/mm/pagemap.rst | 2 +- fs/proc/task_mmu.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index fb578fbbb76c..49857ce1cd03 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -23,7 +23,7 @@ There are four components to pagemap: * Bit 56 page exclusively mapped (since 4.2) * Bit 57 pte is uffd-wp write-protected (since 5.13) (see :ref:`Documentation/admin-guide/mm/userfaultfd.rst `) - * Bits 57-60 zero + * Bits 58-60 zero * Bit 61 page is file-page or shared-anon (since 3.5) * Bit 62 page swapped * Bit 63 page present diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 958fce7aee63..79ca4d69dfd6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1582,7 +1582,8 @@ static const struct mm_walk_ops pagemap_ops = { * Bits 5-54 swap offset if swapped * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) * Bit 56 page exclusively mapped - * Bits 57-60 zero + * Bit 57 pte is uffd-wp write-protected + * Bits 58-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present -- cgit v1.2.3 From d60d34b4d6d1227923bcbe6fdba24c2f03ce69cf Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 18 Feb 2022 11:47:51 +0100 Subject: fuse: fix fileattr op failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a679a61520d8a7b0211a1da990404daf5cc80b72 upstream. The fileattr API conversion broke lsattr on ntfs3g. Previously the ioctl(... FS_IOC_GETFLAGS) returned an EINVAL error, but after the conversion the error returned by the fuse filesystem was not propagated back to the ioctl() system call, resulting in success being returned with bogus values. Fix by checking for outarg.result in fuse_priv_ioctl(), just as generic ioctl code does. Reported-by: Jean-Pierre André Fixes: 72227eac177d ("fuse: convert to fileattr") Cc: # v5.13 Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/ioctl.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 546ea3d58fb4..fc69e1797a33 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -394,9 +394,12 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff, args.out_args[1].value = ptr; err = fuse_simple_request(fm, &args); - if (!err && outarg.flags & FUSE_IOCTL_RETRY) - err = -EIO; - + if (!err) { + if (outarg.result < 0) + err = outarg.result; + else if (outarg.flags & FUSE_IOCTL_RETRY) + err = -EIO; + } return err; } -- cgit v1.2.3 From ca62747b38f59d4e75967ebf63c992de8852ca1b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 7 Mar 2022 16:30:44 +0100 Subject: fuse: fix pipe buffer lifetime for direct_io commit 0c4bcfdecb1ac0967619ee7ff44871d93c08c909 upstream. In FOPEN_DIRECT_IO mode, fuse_file_write_iter() calls fuse_direct_write_iter(), which normally calls fuse_direct_io(), which then imports the write buffer with fuse_get_user_pages(), which uses iov_iter_get_pages() to grab references to userspace pages instead of actually copying memory. On the filesystem device side, these pages can then either be read to userspace (via fuse_dev_read()), or splice()d over into a pipe using fuse_dev_splice_read() as pipe buffers with &nosteal_pipe_buf_ops. This is wrong because after fuse_dev_do_read() unlocks the FUSE request, the userspace filesystem can mark the request as completed, causing write() to return. At that point, the userspace filesystem should no longer have access to the pipe buffer. Fix by copying pages coming from the user address space to new pipe buffers. Reported-by: Jann Horn Fixes: c3021629a0d8 ("fuse: support splice() reading from fuse device") Cc: Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 12 +++++++++++- fs/fuse/file.c | 1 + fs/fuse/fuse_i.h | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a9d21b33da9c..d6b5339c56e2 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -941,7 +941,17 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, while (count) { if (cs->write && cs->pipebufs && page) { - return fuse_ref_page(cs, page, offset, count); + /* + * Can't control lifetime of pipe buffers, so always + * copy user pages. + */ + if (cs->req->args->user_pages) { + err = fuse_copy_fill(cs); + if (err) + return err; + } else { + return fuse_ref_page(cs, page, offset, count); + } } else if (!cs->len) { if (cs->move_pages && page && offset == 0 && count == PAGE_SIZE) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2004d362361e..bc50a9fa84a0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1417,6 +1417,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } + ap->args.user_pages = true; if (write) ap->args.in_pages = true; else diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a59e36c7deae..c3a87586a15f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -256,6 +256,7 @@ struct fuse_args { bool nocreds:1; bool in_pages:1; bool out_pages:1; + bool user_pages:1; bool out_argvar:1; bool page_zeroing:1; bool page_replace:1; -- cgit v1.2.3 From 8275b6699c6dda4d4cec748bc2e879cac532d193 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:23:38 +0000 Subject: watch_queue, pipe: Free watchqueue state after clearing pipe ring commit db8facfc9fafacefe8a835416a6b77c838088f8b upstream. In free_pipe_info(), free the watchqueue state after clearing the pipe ring as each pipe ring descriptor has a release function, and in the case of a notification message, this is watch_queue_pipe_buf_release() which tries to mark the allocation bitmap that was previously released. Fix this by moving the put of the pipe's ref on the watch queue to after the ring has been cleared. We still need to call watch_queue_clear() before doing that to make sure that the pipe is disconnected from any notification sources first. Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/pipe.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 6d4342bad9f1..8b0f6979fb49 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -830,10 +830,8 @@ void free_pipe_info(struct pipe_inode_info *pipe) int i; #ifdef CONFIG_WATCH_QUEUE - if (pipe->watch_queue) { + if (pipe->watch_queue) watch_queue_clear(pipe->watch_queue); - put_watch_queue(pipe->watch_queue); - } #endif (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0); @@ -843,6 +841,10 @@ void free_pipe_info(struct pipe_inode_info *pipe) if (buf->ops) pipe_buf_release(pipe, buf); } +#ifdef CONFIG_WATCH_QUEUE + if (pipe->watch_queue) + put_watch_queue(pipe->watch_queue); +#endif if (pipe->tmp_page) __free_page(pipe->tmp_page); kfree(pipe->bufs); -- cgit v1.2.3 From eb38c2e9fc740b5218dc94d2a6f43802d49e811c Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:24:36 +0000 Subject: watch_queue: Fix lack of barrier/sync/lock between post and read commit 2ed147f015af2b48f41c6f0b6746aa9ea85c19f3 upstream. There's nothing to synchronise post_one_notification() versus pipe_read(). Whilst posting is done under pipe->rd_wait.lock, the reader only takes pipe->mutex which cannot bar notification posting as that may need to be made from contexts that cannot sleep. Fix this by setting pipe->head with a barrier in post_one_notification() and reading pipe->head with a barrier in pipe_read(). If that's not sufficient, the rd_wait.lock will need to be taken, possibly in a ->confirm() op so that it only applies to notifications. The lock would, however, have to be dropped before copy_page_to_iter() is invoked. Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/pipe.c | 3 ++- kernel/watch_queue.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 8b0f6979fb49..751d5b36c84b 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -252,7 +252,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) */ was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); for (;;) { - unsigned int head = pipe->head; + /* Read ->head with a barrier vs post_one_notification() */ + unsigned int head = smp_load_acquire(&pipe->head); unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index e4d7225f539e..1a13066bf6fa 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -113,7 +113,7 @@ static bool post_one_notification(struct watch_queue *wqueue, buf->offset = offset; buf->len = len; buf->flags = PIPE_BUF_FLAG_WHOLE; - pipe->head = head + 1; + smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */ if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { spin_unlock_irq(&pipe->rd_wait.lock); -- cgit v1.2.3 From a1ce40f8aeb19eaedf3743cdca4ec4e09bdce909 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 22 Nov 2021 12:03:38 +0000 Subject: btrfs: make send work with concurrent block group relocation commit d96b34248c2f4ea8cd09286090f2f6f77102eaab upstream. We don't allow send and balance/relocation to run in parallel in order to prevent send failing or silently producing some bad stream. This is because while send is using an extent (specially metadata) or about to read a metadata extent and expecting it belongs to a specific parent node, relocation can run, the transaction used for the relocation is committed and the extent gets reallocated while send is still using the extent, so it ends up with a different content than expected. This can result in just failing to read a metadata extent due to failure of the validation checks (parent transid, level, etc), failure to find a backreference for a data extent, and other unexpected failures. Besides reallocation, there's also a similar problem of an extent getting discarded when it's unpinned after the transaction used for block group relocation is committed. The restriction between balance and send was added in commit 9e967495e0e0 ("Btrfs: prevent send failures and crashes due to concurrent relocation"), kernel 5.3, while the more general restriction between send and relocation was added in commit 1cea5cf0e664 ("btrfs: ensure relocation never runs while we have send operations running"), kernel 5.14. Both send and relocation can be very long running operations. Relocation because it has to do a lot of IO and expensive backreference lookups in case there are many snapshots, and send due to read IO when operating on very large trees. This makes it inconvenient for users and tools to deal with scheduling both operations. For zoned filesystem we also have automatic block group relocation, so send can fail with -EAGAIN when users least expect it or send can end up delaying the block group relocation for too long. In the future we might also get the automatic block group relocation for non zoned filesystems. This change makes it possible for send and relocation to run in parallel. This is achieved the following way: 1) For all tree searches, send acquires a read lock on the commit root semaphore; 2) After each tree search, and before releasing the commit root semaphore, the leaf is cloned and placed in the search path (struct btrfs_path); 3) After releasing the commit root semaphore, the changed_cb() callback is invoked, which operates on the leaf and writes commands to the pipe (or file in case send/receive is not used with a pipe). It's important here to not hold a lock on the commit root semaphore, because if we did we could deadlock when sending and receiving to the same filesystem using a pipe - the send task blocks on the pipe because it's full, the receive task, which is the only consumer of the pipe, triggers a transaction commit when attempting to create a subvolume or reserve space for a write operation for example, but the transaction commit blocks trying to write lock the commit root semaphore, resulting in a deadlock; 4) Before moving to the next key, or advancing to the next change in case of an incremental send, check if a transaction used for relocation was committed (or is about to finish its commit). If so, release the search path(s) and restart the search, to where we were before, so that we don't operate on stale extent buffers. The search restarts are always possible because both the send and parent roots are RO, and no one can add, remove of update keys (change their offset) in RO trees - the only exception is deduplication, but that is still not allowed to run in parallel with send; 5) Periodically check if there is contention on the commit root semaphore, which means there is a transaction commit trying to write lock it, and release the semaphore and reschedule if there is contention, so as to avoid causing any significant delays to transaction commits. This leaves some room for optimizations for send to have less path releases and re searching the trees when there's relocation running, but for now it's kept simple as it performs quite well (on very large trees with resulting send streams in the order of a few hundred gigabytes). Test case btrfs/187, from fstests, stresses relocation, send and deduplication attempting to run in parallel, but without verifying if send succeeds and if it produces correct streams. A new test case will be added that exercises relocation happening in parallel with send and then checks that send succeeds and the resulting streams are correct. A final note is that for now this still leaves the mutual exclusion between send operations and deduplication on files belonging to a root used by send operations. A solution for that will be slightly more complex but it will eventually be built on top of this change. Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Anand Jain Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/block-group.c | 9 +- fs/btrfs/ctree.c | 98 ++++++++++---- fs/btrfs/ctree.h | 14 +- fs/btrfs/disk-io.c | 4 +- fs/btrfs/relocation.c | 13 -- fs/btrfs/send.c | 357 ++++++++++++++++++++++++++++++++++++++++++------- fs/btrfs/transaction.c | 4 + 7 files changed, 395 insertions(+), 104 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index d721c66d0b41..5edd07e0232d 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1491,7 +1491,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) container_of(work, struct btrfs_fs_info, reclaim_bgs_work); struct btrfs_block_group *bg; struct btrfs_space_info *space_info; - LIST_HEAD(again_list); if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; @@ -1562,18 +1561,14 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); ret = btrfs_relocate_chunk(fs_info, bg->start); - if (ret && ret != -EAGAIN) + if (ret) btrfs_err(fs_info, "error relocating chunk %llu", bg->start); next: + btrfs_put_block_group(bg); spin_lock(&fs_info->unused_bgs_lock); - if (ret == -EAGAIN && list_empty(&bg->bg_list)) - list_add_tail(&bg->bg_list, &again_list); - else - btrfs_put_block_group(bg); } - list_splice_tail(&again_list, &fs_info->reclaim_bgs); spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 95a6a63caf04..899f85445925 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1566,32 +1566,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, struct btrfs_path *p, int write_lock_level) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; int root_lock = 0; int level = 0; if (p->search_commit_root) { - /* - * The commit roots are read only so we always do read locks, - * and we always must hold the commit_root_sem when doing - * searches on them, the only exception is send where we don't - * want to block transaction commits for a long time, so - * we need to clone the commit root in order to avoid races - * with transaction commits that create a snapshot of one of - * the roots used by a send operation. - */ - if (p->need_commit_sem) { - down_read(&fs_info->commit_root_sem); - b = btrfs_clone_extent_buffer(root->commit_root); - up_read(&fs_info->commit_root_sem); - if (!b) - return ERR_PTR(-ENOMEM); - - } else { - b = root->commit_root; - atomic_inc(&b->refs); - } + b = root->commit_root; + atomic_inc(&b->refs); level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when @@ -1657,6 +1638,42 @@ out: return b; } +/* + * Replace the extent buffer at the lowest level of the path with a cloned + * version. The purpose is to be able to use it safely, after releasing the + * commit root semaphore, even if relocation is happening in parallel, the + * transaction used for relocation is committed and the extent buffer is + * reallocated in the next transaction. + * + * This is used in a context where the caller does not prevent transaction + * commits from happening, either by holding a transaction handle or holding + * some lock, while it's doing searches through a commit root. + * At the moment it's only used for send operations. + */ +static int finish_need_commit_sem_search(struct btrfs_path *path) +{ + const int i = path->lowest_level; + const int slot = path->slots[i]; + struct extent_buffer *lowest = path->nodes[i]; + struct extent_buffer *clone; + + ASSERT(path->need_commit_sem); + + if (!lowest) + return 0; + + lockdep_assert_held_read(&lowest->fs_info->commit_root_sem); + + clone = btrfs_clone_extent_buffer(lowest); + if (!clone) + return -ENOMEM; + + btrfs_release_path(path); + path->nodes[i] = clone; + path->slots[i] = slot; + + return 0; +} /* * btrfs_search_slot - look for a key in a tree and perform necessary @@ -1693,6 +1710,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; int slot; int ret; @@ -1734,6 +1752,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, min_write_lock_level = write_lock_level; + if (p->need_commit_sem) { + ASSERT(p->search_commit_root); + down_read(&fs_info->commit_root_sem); + } + again: prev_cmp = -1; b = btrfs_search_slot_get_root(root, p, write_lock_level); @@ -1928,6 +1951,16 @@ cow_done: done: if (ret < 0 && !p->skip_release_on_error) btrfs_release_path(p); + + if (p->need_commit_sem) { + int ret2; + + ret2 = finish_need_commit_sem_search(p); + up_read(&fs_info->commit_root_sem); + if (ret2) + ret = ret2; + } + return ret; } ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); @@ -4396,7 +4429,9 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, int level; struct extent_buffer *c; struct extent_buffer *next; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; + bool need_commit_sem = false; u32 nritems; int ret; int i; @@ -4413,14 +4448,20 @@ again: path->keep_locks = 1; - if (time_seq) + if (time_seq) { ret = btrfs_search_old_slot(root, &key, path, time_seq); - else + } else { + if (path->need_commit_sem) { + path->need_commit_sem = 0; + need_commit_sem = true; + down_read(&fs_info->commit_root_sem); + } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + } path->keep_locks = 0; if (ret < 0) - return ret; + goto done; nritems = btrfs_header_nritems(path->nodes[0]); /* @@ -4543,6 +4584,15 @@ again: ret = 0; done: unlock_up(path, 0, 1, 0, NULL); + if (need_commit_sem) { + int ret2; + + path->need_commit_sem = 1; + ret2 = finish_need_commit_sem_search(path); + up_read(&fs_info->commit_root_sem); + if (ret2) + ret = ret2; + } return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b46409801647..e89f814cc8f5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -568,7 +568,6 @@ enum { /* * Indicate that relocation of a chunk has started, it's set per chunk * and is toggled between chunks. - * Set, tested and cleared while holding fs_info::send_reloc_lock. */ BTRFS_FS_RELOC_RUNNING, @@ -668,6 +667,12 @@ struct btrfs_fs_info { u64 generation; u64 last_trans_committed; + /* + * Generation of the last transaction used for block group relocation + * since the filesystem was last mounted (or 0 if none happened yet). + * Must be written and read while holding btrfs_fs_info::commit_root_sem. + */ + u64 last_reloc_trans; u64 avg_delayed_ref_runtime; /* @@ -997,13 +1002,6 @@ struct btrfs_fs_info { struct crypto_shash *csum_shash; - spinlock_t send_reloc_lock; - /* - * Number of send operations in progress. - * Updated while holding fs_info::send_reloc_lock. - */ - int send_in_progress; - /* Type of exclusive operation running, protected by super_lock */ enum btrfs_exclusive_operation exclusive_operation; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2180fcef56ca..d5a590b11be5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2859,6 +2859,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) /* All successful */ fs_info->generation = generation; fs_info->last_trans_committed = generation; + fs_info->last_reloc_trans = 0; /* Always begin writing backup roots after the one being used */ if (backup_index < 0) { @@ -2992,9 +2993,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; - spin_lock_init(&fs_info->send_reloc_lock); - fs_info->send_in_progress = 0; - fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH; INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work); } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index a050f9748fa7..a6661f2ad2c0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3854,25 +3854,14 @@ out: * 0 success * -EINPROGRESS operation is already in progress, that's probably a bug * -ECANCELED cancellation request was set before the operation started - * -EAGAIN can not start because there are ongoing send operations */ static int reloc_chunk_start(struct btrfs_fs_info *fs_info) { - spin_lock(&fs_info->send_reloc_lock); - if (fs_info->send_in_progress) { - btrfs_warn_rl(fs_info, -"cannot run relocation while send operations are in progress (%d in progress)", - fs_info->send_in_progress); - spin_unlock(&fs_info->send_reloc_lock); - return -EAGAIN; - } if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { /* This should not happen */ - spin_unlock(&fs_info->send_reloc_lock); btrfs_err(fs_info, "reloc already running, cannot start"); return -EINPROGRESS; } - spin_unlock(&fs_info->send_reloc_lock); if (atomic_read(&fs_info->reloc_cancel_req) > 0) { btrfs_info(fs_info, "chunk relocation canceled on start"); @@ -3894,9 +3883,7 @@ static void reloc_chunk_end(struct btrfs_fs_info *fs_info) /* Requested after start, clear bit first so any waiters can continue */ if (atomic_read(&fs_info->reloc_cancel_req) > 0) btrfs_info(fs_info, "chunk relocation canceled during operation"); - spin_lock(&fs_info->send_reloc_lock); clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); - spin_unlock(&fs_info->send_reloc_lock); atomic_set(&fs_info->reloc_cancel_req, 0); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 5612e8bf2ace..4d2c6ce29fe5 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -24,6 +24,7 @@ #include "transaction.h" #include "compression.h" #include "xattr.h" +#include "print-tree.h" /* * Maximum number of references an extent can have in order for us to attempt to @@ -95,6 +96,15 @@ struct send_ctx { struct btrfs_path *right_path; struct btrfs_key *cmp_key; + /* + * Keep track of the generation of the last transaction that was used + * for relocating a block group. This is periodically checked in order + * to detect if a relocation happened since the last check, so that we + * don't operate on stale extent buffers for nodes (level >= 1) or on + * stale disk_bytenr values of file extent items. + */ + u64 last_reloc_trans; + /* * infos of the currently processed inode. In case of deleted inodes, * these are the values from the deleted inode. @@ -1415,6 +1425,26 @@ static int find_extent_clone(struct send_ctx *sctx, if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); + if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { + /* + * A transaction commit for a transaction in which block group + * relocation was done just happened. + * The disk_bytenr of the file extent item we processed is + * possibly stale, referring to the extent's location before + * relocation. So act as if we haven't found any clone sources + * and fallback to write commands, which will read the correct + * data from the new extent location. Otherwise we will fail + * below because we haven't found our own back reference or we + * could be getting incorrect sources in case the old extent + * was already reallocated after the relocation. + */ + up_read(&fs_info->commit_root_sem); + ret = -ENOENT; + goto out; + } + up_read(&fs_info->commit_root_sem); + if (!backref_ctx.found_itself) { /* found a bug in backref code? */ ret = -EIO; @@ -6596,6 +6626,50 @@ static int changed_cb(struct btrfs_path *left_path, { int ret = 0; + /* + * We can not hold the commit root semaphore here. This is because in + * the case of sending and receiving to the same filesystem, using a + * pipe, could result in a deadlock: + * + * 1) The task running send blocks on the pipe because it's full; + * + * 2) The task running receive, which is the only consumer of the pipe, + * is waiting for a transaction commit (for example due to a space + * reservation when doing a write or triggering a transaction commit + * when creating a subvolume); + * + * 3) The transaction is waiting to write lock the commit root semaphore, + * but can not acquire it since it's being held at 1). + * + * Down this call chain we write to the pipe through kernel_write(). + * The same type of problem can also happen when sending to a file that + * is stored in the same filesystem - when reserving space for a write + * into the file, we can trigger a transaction commit. + * + * Our caller has supplied us with clones of leaves from the send and + * parent roots, so we're safe here from a concurrent relocation and + * further reallocation of metadata extents while we are here. Below we + * also assert that the leaves are clones. + */ + lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem); + + /* + * We always have a send root, so left_path is never NULL. We will not + * have a leaf when we have reached the end of the send root but have + * not yet reached the end of the parent root. + */ + if (left_path->nodes[0]) + ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, + &left_path->nodes[0]->bflags)); + /* + * When doing a full send we don't have a parent root, so right_path is + * NULL. When doing an incremental send, we may have reached the end of + * the parent root already, so we don't have a leaf at right_path. + */ + if (right_path && right_path->nodes[0]) + ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, + &right_path->nodes[0]->bflags)); + if (result == BTRFS_COMPARE_TREE_SAME) { if (key->type == BTRFS_INODE_REF_KEY || key->type == BTRFS_INODE_EXTREF_KEY) { @@ -6642,14 +6716,46 @@ out: return ret; } +static int search_key_again(const struct send_ctx *sctx, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key) +{ + int ret; + + if (!path->need_commit_sem) + lockdep_assert_held_read(&root->fs_info->commit_root_sem); + + /* + * Roots used for send operations are readonly and no one can add, + * update or remove keys from them, so we should be able to find our + * key again. The only exception is deduplication, which can operate on + * readonly roots and add, update or remove keys to/from them - but at + * the moment we don't allow it to run in parallel with send. + */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + ASSERT(ret <= 0); + if (ret > 0) { + btrfs_print_tree(path->nodes[path->lowest_level], false); + btrfs_err(root->fs_info, +"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d", + key->objectid, key->type, key->offset, + (root == sctx->parent_root ? "parent" : "send"), + root->root_key.objectid, path->lowest_level, + path->slots[path->lowest_level]); + return -EUCLEAN; + } + + return ret; +} + static int full_send_tree(struct send_ctx *sctx) { int ret; struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; + struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_path *path; - struct extent_buffer *eb; - int slot; path = alloc_path_for_send(); if (!path) @@ -6660,6 +6766,10 @@ static int full_send_tree(struct send_ctx *sctx) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; + down_read(&fs_info->commit_root_sem); + sctx->last_reloc_trans = fs_info->last_reloc_trans; + up_read(&fs_info->commit_root_sem); + ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); if (ret < 0) goto out; @@ -6667,15 +6777,35 @@ static int full_send_tree(struct send_ctx *sctx) goto out_finish; while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(eb, &key, slot); + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ret = changed_cb(path, NULL, &key, BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); + if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { + sctx->last_reloc_trans = fs_info->last_reloc_trans; + up_read(&fs_info->commit_root_sem); + /* + * A transaction used for relocating a block group was + * committed or is about to finish its commit. Release + * our path (leaf) and restart the search, so that we + * avoid operating on any file extent items that are + * stale, with a disk_bytenr that reflects a pre + * relocation value. This way we avoid as much as + * possible to fallback to regular writes when checking + * if we can clone file ranges. + */ + btrfs_release_path(path); + ret = search_key_again(sctx, send_root, path, &key); + if (ret < 0) + goto out; + } else { + up_read(&fs_info->commit_root_sem); + } + ret = btrfs_next_item(send_root, path); if (ret < 0) goto out; @@ -6693,6 +6823,20 @@ out: return ret; } +static int replace_node_with_clone(struct btrfs_path *path, int level) +{ + struct extent_buffer *clone; + + clone = btrfs_clone_extent_buffer(path->nodes[level]); + if (!clone) + return -ENOMEM; + + free_extent_buffer(path->nodes[level]); + path->nodes[level] = clone; + + return 0; +} + static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen) { struct extent_buffer *eb; @@ -6702,6 +6846,8 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen u64 reada_max; u64 reada_done = 0; + lockdep_assert_held_read(&parent->fs_info->commit_root_sem); + BUG_ON(*level == 0); eb = btrfs_read_node_slot(parent, slot); if (IS_ERR(eb)) @@ -6725,6 +6871,10 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen path->nodes[*level - 1] = eb; path->slots[*level - 1] = 0; (*level)--; + + if (*level == 0) + return replace_node_with_clone(path, 0); + return 0; } @@ -6738,8 +6888,10 @@ static int tree_move_next_or_upnext(struct btrfs_path *path, path->slots[*level]++; while (path->slots[*level] >= nritems) { - if (*level == root_level) + if (*level == root_level) { + path->slots[*level] = nritems - 1; return -1; + } /* move upnext */ path->slots[*level] = 0; @@ -6771,14 +6923,20 @@ static int tree_advance(struct btrfs_path *path, } else { ret = tree_move_down(path, level, reada_min_gen); } - if (ret >= 0) { - if (*level == 0) - btrfs_item_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - else - btrfs_node_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - } + + /* + * Even if we have reached the end of a tree, ret is -1, update the key + * anyway, so that in case we need to restart due to a block group + * relocation, we can assert that the last key of the root node still + * exists in the tree. + */ + if (*level == 0) + btrfs_item_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + else + btrfs_node_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + return ret; } @@ -6807,6 +6965,97 @@ static int tree_compare_item(struct btrfs_path *left_path, return 0; } +/* + * A transaction used for relocating a block group was committed or is about to + * finish its commit. Release our paths and restart the search, so that we are + * not using stale extent buffers: + * + * 1) For levels > 0, we are only holding references of extent buffers, without + * any locks on them, which does not prevent them from having been relocated + * and reallocated after the last time we released the commit root semaphore. + * The exception are the root nodes, for which we always have a clone, see + * the comment at btrfs_compare_trees(); + * + * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so + * we are safe from the concurrent relocation and reallocation. However they + * can have file extent items with a pre relocation disk_bytenr value, so we + * restart the start from the current commit roots and clone the new leaves so + * that we get the post relocation disk_bytenr values. Not doing so, could + * make us clone the wrong data in case there are new extents using the old + * disk_bytenr that happen to be shared. + */ +static int restart_after_relocation(struct btrfs_path *left_path, + struct btrfs_path *right_path, + const struct btrfs_key *left_key, + const struct btrfs_key *right_key, + int left_level, + int right_level, + const struct send_ctx *sctx) +{ + int root_level; + int ret; + + lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem); + + btrfs_release_path(left_path); + btrfs_release_path(right_path); + + /* + * Since keys can not be added or removed to/from our roots because they + * are readonly and we do not allow deduplication to run in parallel + * (which can add, remove or change keys), the layout of the trees should + * not change. + */ + left_path->lowest_level = left_level; + ret = search_key_again(sctx, sctx->send_root, left_path, left_key); + if (ret < 0) + return ret; + + right_path->lowest_level = right_level; + ret = search_key_again(sctx, sctx->parent_root, right_path, right_key); + if (ret < 0) + return ret; + + /* + * If the lowest level nodes are leaves, clone them so that they can be + * safely used by changed_cb() while not under the protection of the + * commit root semaphore, even if relocation and reallocation happens in + * parallel. + */ + if (left_level == 0) { + ret = replace_node_with_clone(left_path, 0); + if (ret < 0) + return ret; + } + + if (right_level == 0) { + ret = replace_node_with_clone(right_path, 0); + if (ret < 0) + return ret; + } + + /* + * Now clone the root nodes (unless they happen to be the leaves we have + * already cloned). This is to protect against concurrent snapshotting of + * the send and parent roots (see the comment at btrfs_compare_trees()). + */ + root_level = btrfs_header_level(sctx->send_root->commit_root); + if (root_level > 0) { + ret = replace_node_with_clone(left_path, root_level); + if (ret < 0) + return ret; + } + + root_level = btrfs_header_level(sctx->parent_root->commit_root); + if (root_level > 0) { + ret = replace_node_with_clone(right_path, root_level); + if (ret < 0) + return ret; + } + + return 0; +} + /* * This function compares two trees and calls the provided callback for * every changed/new/deleted item it finds. @@ -6835,10 +7084,10 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, int right_root_level; int left_level; int right_level; - int left_end_reached; - int right_end_reached; - int advance_left; - int advance_right; + int left_end_reached = 0; + int right_end_reached = 0; + int advance_left = 0; + int advance_right = 0; u64 left_blockptr; u64 right_blockptr; u64 left_gen; @@ -6906,12 +7155,18 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, down_read(&fs_info->commit_root_sem); left_level = btrfs_header_level(left_root->commit_root); left_root_level = left_level; + /* + * We clone the root node of the send and parent roots to prevent races + * with snapshot creation of these roots. Snapshot creation COWs the + * root node of a tree, so after the transaction is committed the old + * extent can be reallocated while this send operation is still ongoing. + * So we clone them, under the commit root semaphore, to be race free. + */ left_path->nodes[left_level] = btrfs_clone_extent_buffer(left_root->commit_root); if (!left_path->nodes[left_level]) { - up_read(&fs_info->commit_root_sem); ret = -ENOMEM; - goto out; + goto out_unlock; } right_level = btrfs_header_level(right_root->commit_root); @@ -6919,9 +7174,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, right_path->nodes[right_level] = btrfs_clone_extent_buffer(right_root->commit_root); if (!right_path->nodes[right_level]) { - up_read(&fs_info->commit_root_sem); ret = -ENOMEM; - goto out; + goto out_unlock; } /* * Our right root is the parent root, while the left root is the "send" @@ -6931,7 +7185,6 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, * will need to read them at some point. */ reada_min_gen = btrfs_header_generation(right_root->commit_root); - up_read(&fs_info->commit_root_sem); if (left_level == 0) btrfs_item_key_to_cpu(left_path->nodes[left_level], @@ -6946,11 +7199,26 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, btrfs_node_key_to_cpu(right_path->nodes[right_level], &right_key, right_path->slots[right_level]); - left_end_reached = right_end_reached = 0; - advance_left = advance_right = 0; + sctx->last_reloc_trans = fs_info->last_reloc_trans; while (1) { - cond_resched(); + if (need_resched() || + rwsem_is_contended(&fs_info->commit_root_sem)) { + up_read(&fs_info->commit_root_sem); + cond_resched(); + down_read(&fs_info->commit_root_sem); + } + + if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { + ret = restart_after_relocation(left_path, right_path, + &left_key, &right_key, + left_level, right_level, + sctx); + if (ret < 0) + goto out_unlock; + sctx->last_reloc_trans = fs_info->last_reloc_trans; + } + if (advance_left && !left_end_reached) { ret = tree_advance(left_path, &left_level, left_root_level, @@ -6959,7 +7227,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, if (ret == -1) left_end_reached = ADVANCE; else if (ret < 0) - goto out; + goto out_unlock; advance_left = 0; } if (advance_right && !right_end_reached) { @@ -6970,54 +7238,55 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, if (ret == -1) right_end_reached = ADVANCE; else if (ret < 0) - goto out; + goto out_unlock; advance_right = 0; } if (left_end_reached && right_end_reached) { ret = 0; - goto out; + goto out_unlock; } else if (left_end_reached) { if (right_level == 0) { + up_read(&fs_info->commit_root_sem); ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, sctx); if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); } advance_right = ADVANCE; continue; } else if (right_end_reached) { if (left_level == 0) { + up_read(&fs_info->commit_root_sem); ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); } advance_left = ADVANCE; continue; } if (left_level == 0 && right_level == 0) { + up_read(&fs_info->commit_root_sem); cmp = btrfs_comp_cpu_keys(&left_key, &right_key); if (cmp < 0) { ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, sctx); - if (ret < 0) - goto out; advance_left = ADVANCE; } else if (cmp > 0) { ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, sctx); - if (ret < 0) - goto out; advance_right = ADVANCE; } else { enum btrfs_compare_tree_result result; @@ -7031,11 +7300,13 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, result = BTRFS_COMPARE_TREE_SAME; ret = changed_cb(left_path, right_path, &left_key, result, sctx); - if (ret < 0) - goto out; advance_left = ADVANCE; advance_right = ADVANCE; } + + if (ret < 0) + goto out; + down_read(&fs_info->commit_root_sem); } else if (left_level == right_level) { cmp = btrfs_comp_cpu_keys(&left_key, &right_key); if (cmp < 0) { @@ -7075,6 +7346,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, } } +out_unlock: + up_read(&fs_info->commit_root_sem); out: btrfs_free_path(left_path); btrfs_free_path(right_path); @@ -7413,21 +7686,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (ret) goto out; - spin_lock(&fs_info->send_reloc_lock); - if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { - spin_unlock(&fs_info->send_reloc_lock); - btrfs_warn_rl(fs_info, - "cannot run send because a relocation operation is in progress"); - ret = -EAGAIN; - goto out; - } - fs_info->send_in_progress++; - spin_unlock(&fs_info->send_reloc_lock); - ret = send_subvol(sctx); - spin_lock(&fs_info->send_reloc_lock); - fs_info->send_in_progress--; - spin_unlock(&fs_info->send_reloc_lock); if (ret < 0) goto out; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9a6009108ea5..642cd2b55fa0 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -163,6 +163,10 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) struct btrfs_caching_control *caching_ctl, *next; down_write(&fs_info->commit_root_sem); + + if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) + fs_info->last_reloc_trans = trans->transid; + list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits, dirty_list) { list_del_init(&root->dirty_list); -- cgit v1.2.3 From b786b64dcb312922f9d671b8bd75d3dec5ed9a53 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 16 Mar 2022 16:15:09 -0700 Subject: ocfs2: fix crash when initialize filecheck kobj fails commit 7b0b1332cfdb94489836b67d088a779699f8e47e upstream. Once s_root is set, genric_shutdown_super() will be called if fill_super() fails. That means, we will call ocfs2_dismount_volume() twice in such case, which can lead to kernel crash. Fix this issue by initializing filecheck kobj before setting s_root. Link: https://lkml.kernel.org/r/20220310081930.86305-1-joseph.qi@linux.alibaba.com Fixes: 5f483c4abb50 ("ocfs2: add kobject for online file check") Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/super.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 5c914ce9b3ac..7ba3dabe16f0 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1106,17 +1106,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) goto read_super_error; } - root = d_make_root(inode); - if (!root) { - status = -ENOMEM; - mlog_errno(status); - goto read_super_error; - } - - sb->s_root = root; - - ocfs2_complete_mount_recovery(osb); - osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj); if (!osb->osb_dev_kset) { @@ -1134,6 +1123,17 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) goto read_super_error; } + root = d_make_root(inode); + if (!root) { + status = -ENOMEM; + mlog_errno(status); + goto read_super_error; + } + + sb->s_root = root; + + ocfs2_complete_mount_recovery(osb); + if (ocfs2_mount_local(osb)) snprintf(nodestr, sizeof(nodestr), "local"); else -- cgit v1.2.3 From 4c5d94990fa2fd609360ecd0f7e183212a7d115c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 18 Jan 2022 13:39:34 +0000 Subject: btrfs: skip reserved bytes warning on unmount after log cleanup failure commit 40cdc509877bacb438213b83c7541c5e24a1d9ec upstream. After the recent changes made by commit c2e39305299f01 ("btrfs: clear extent buffer uptodate when we fail to write it") and its followup fix, commit 651740a5024117 ("btrfs: check WRITE_ERR when trying to read an extent buffer"), we can now end up not cleaning up space reservations of log tree extent buffers after a transaction abort happens, as well as not cleaning up still dirty extent buffers. This happens because if writeback for a log tree extent buffer failed, then we have cleared the bit EXTENT_BUFFER_UPTODATE from the extent buffer and we have also set the bit EXTENT_BUFFER_WRITE_ERR on it. Later on, when trying to free the log tree with free_log_tree(), which iterates over the tree, we can end up getting an -EIO error when trying to read a node or a leaf, since read_extent_buffer_pages() returns -EIO if an extent buffer does not have EXTENT_BUFFER_UPTODATE set and has the EXTENT_BUFFER_WRITE_ERR bit set. Getting that -EIO means that we return immediately as we can not iterate over the entire tree. In that case we never update the reserved space for an extent buffer in the respective block group and space_info object. When this happens we get the following traces when unmounting the fs: [174957.284509] BTRFS: error (device dm-0) in cleanup_transaction:1913: errno=-5 IO failure [174957.286497] BTRFS: error (device dm-0) in free_log_tree:3420: errno=-5 IO failure [174957.399379] ------------[ cut here ]------------ [174957.402497] WARNING: CPU: 2 PID: 3206883 at fs/btrfs/block-group.c:127 btrfs_put_block_group+0x77/0xb0 [btrfs] [174957.407523] Modules linked in: btrfs overlay dm_zero (...) [174957.424917] CPU: 2 PID: 3206883 Comm: umount Tainted: G W 5.16.0-rc5-btrfs-next-109 #1 [174957.426689] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [174957.428716] RIP: 0010:btrfs_put_block_group+0x77/0xb0 [btrfs] [174957.429717] Code: 21 48 8b bd (...) [174957.432867] RSP: 0018:ffffb70d41cffdd0 EFLAGS: 00010206 [174957.433632] RAX: 0000000000000001 RBX: ffff8b09c3848000 RCX: ffff8b0758edd1c8 [174957.434689] RDX: 0000000000000001 RSI: ffffffffc0b467e7 RDI: ffff8b0758edd000 [174957.436068] RBP: ffff8b0758edd000 R08: 0000000000000000 R09: 0000000000000000 [174957.437114] R10: 0000000000000246 R11: 0000000000000000 R12: ffff8b09c3848148 [174957.438140] R13: ffff8b09c3848198 R14: ffff8b0758edd188 R15: dead000000000100 [174957.439317] FS: 00007f328fb82800(0000) GS:ffff8b0a2d200000(0000) knlGS:0000000000000000 [174957.440402] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [174957.441164] CR2: 00007fff13563e98 CR3: 0000000404f4e005 CR4: 0000000000370ee0 [174957.442117] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [174957.443076] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [174957.443948] Call Trace: [174957.444264] [174957.444538] btrfs_free_block_groups+0x255/0x3c0 [btrfs] [174957.445238] close_ctree+0x301/0x357 [btrfs] [174957.445803] ? call_rcu+0x16c/0x290 [174957.446250] generic_shutdown_super+0x74/0x120 [174957.446832] kill_anon_super+0x14/0x30 [174957.447305] btrfs_kill_super+0x12/0x20 [btrfs] [174957.447890] deactivate_locked_super+0x31/0xa0 [174957.448440] cleanup_mnt+0x147/0x1c0 [174957.448888] task_work_run+0x5c/0xa0 [174957.449336] exit_to_user_mode_prepare+0x1e5/0x1f0 [174957.449934] syscall_exit_to_user_mode+0x16/0x40 [174957.450512] do_syscall_64+0x48/0xc0 [174957.450980] entry_SYSCALL_64_after_hwframe+0x44/0xae [174957.451605] RIP: 0033:0x7f328fdc4a97 [174957.452059] Code: 03 0c 00 f7 (...) [174957.454320] RSP: 002b:00007fff13564ec8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [174957.455262] RAX: 0000000000000000 RBX: 00007f328feea264 RCX: 00007f328fdc4a97 [174957.456131] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000560b8ae51dd0 [174957.457118] RBP: 0000560b8ae51ba0 R08: 0000000000000000 R09: 00007fff13563c40 [174957.458005] R10: 00007f328fe49fc0 R11: 0000000000000246 R12: 0000000000000000 [174957.459113] R13: 0000560b8ae51dd0 R14: 0000560b8ae51cb0 R15: 0000000000000000 [174957.460193] [174957.460534] irq event stamp: 0 [174957.461003] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [174957.461947] hardirqs last disabled at (0): [] copy_process+0x934/0x2040 [174957.463147] softirqs last enabled at (0): [] copy_process+0x934/0x2040 [174957.465116] softirqs last disabled at (0): [<0000000000000000>] 0x0 [174957.466323] ---[ end trace bc7ee0c490bce3af ]--- [174957.467282] ------------[ cut here ]------------ [174957.468184] WARNING: CPU: 2 PID: 3206883 at fs/btrfs/block-group.c:3976 btrfs_free_block_groups+0x330/0x3c0 [btrfs] [174957.470066] Modules linked in: btrfs overlay dm_zero (...) [174957.483137] CPU: 2 PID: 3206883 Comm: umount Tainted: G W 5.16.0-rc5-btrfs-next-109 #1 [174957.484691] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [174957.486853] RIP: 0010:btrfs_free_block_groups+0x330/0x3c0 [btrfs] [174957.488050] Code: 00 00 00 ad de (...) [174957.491479] RSP: 0018:ffffb70d41cffde0 EFLAGS: 00010206 [174957.492520] RAX: ffff8b08d79310b0 RBX: ffff8b09c3848000 RCX: 0000000000000000 [174957.493868] RDX: 0000000000000001 RSI: fffff443055ee600 RDI: ffffffffb1131846 [174957.495183] RBP: ffff8b08d79310b0 R08: 0000000000000000 R09: 0000000000000000 [174957.496580] R10: 0000000000000001 R11: 0000000000000000 R12: ffff8b08d7931000 [174957.498027] R13: ffff8b09c38492b0 R14: dead000000000122 R15: dead000000000100 [174957.499438] FS: 00007f328fb82800(0000) GS:ffff8b0a2d200000(0000) knlGS:0000000000000000 [174957.500990] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [174957.502117] CR2: 00007fff13563e98 CR3: 0000000404f4e005 CR4: 0000000000370ee0 [174957.503513] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [174957.504864] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [174957.506167] Call Trace: [174957.506654] [174957.507047] close_ctree+0x301/0x357 [btrfs] [174957.507867] ? call_rcu+0x16c/0x290 [174957.508567] generic_shutdown_super+0x74/0x120 [174957.509447] kill_anon_super+0x14/0x30 [174957.510194] btrfs_kill_super+0x12/0x20 [btrfs] [174957.511123] deactivate_locked_super+0x31/0xa0 [174957.511976] cleanup_mnt+0x147/0x1c0 [174957.512610] task_work_run+0x5c/0xa0 [174957.513309] exit_to_user_mode_prepare+0x1e5/0x1f0 [174957.514231] syscall_exit_to_user_mode+0x16/0x40 [174957.515069] do_syscall_64+0x48/0xc0 [174957.515718] entry_SYSCALL_64_after_hwframe+0x44/0xae [174957.516688] RIP: 0033:0x7f328fdc4a97 [174957.517413] Code: 03 0c 00 f7 d8 (...) [174957.521052] RSP: 002b:00007fff13564ec8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [174957.522514] RAX: 0000000000000000 RBX: 00007f328feea264 RCX: 00007f328fdc4a97 [174957.523950] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000560b8ae51dd0 [174957.525375] RBP: 0000560b8ae51ba0 R08: 0000000000000000 R09: 00007fff13563c40 [174957.526763] R10: 00007f328fe49fc0 R11: 0000000000000246 R12: 0000000000000000 [174957.528058] R13: 0000560b8ae51dd0 R14: 0000560b8ae51cb0 R15: 0000000000000000 [174957.529404] [174957.529843] irq event stamp: 0 [174957.530256] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [174957.531061] hardirqs last disabled at (0): [] copy_process+0x934/0x2040 [174957.532075] softirqs last enabled at (0): [] copy_process+0x934/0x2040 [174957.533083] softirqs last disabled at (0): [<0000000000000000>] 0x0 [174957.533865] ---[ end trace bc7ee0c490bce3b0 ]--- [174957.534452] BTRFS info (device dm-0): space_info 4 has 1070841856 free, is not full [174957.535404] BTRFS info (device dm-0): space_info total=1073741824, used=2785280, pinned=0, reserved=49152, may_use=0, readonly=65536 zone_unusable=0 [174957.537029] BTRFS info (device dm-0): global_block_rsv: size 0 reserved 0 [174957.537859] BTRFS info (device dm-0): trans_block_rsv: size 0 reserved 0 [174957.538697] BTRFS info (device dm-0): chunk_block_rsv: size 0 reserved 0 [174957.539552] BTRFS info (device dm-0): delayed_block_rsv: size 0 reserved 0 [174957.540403] BTRFS info (device dm-0): delayed_refs_rsv: size 0 reserved 0 This also means that in case we have log tree extent buffers that are still dirty, we can end up not cleaning them up in case we find an extent buffer with EXTENT_BUFFER_WRITE_ERR set on it, as in that case we have no way for iterating over the rest of the tree. This issue is very often triggered with test cases generic/475 and generic/648 from fstests. The issue could almost be fixed by iterating over the io tree attached to each log root which keeps tracks of the range of allocated extent buffers, log_root->dirty_log_pages, however that does not work and has some inconveniences: 1) After we sync the log, we clear the range of the extent buffers from the io tree, so we can't find them after writeback. We could keep the ranges in the io tree, with a separate bit to signal they represent extent buffers already written, but that means we need to hold into more memory until the transaction commits. How much more memory is used depends a lot on whether we are able to allocate contiguous extent buffers on disk (and how often) for a log tree - if we are able to, then a single extent state record can represent multiple extent buffers, otherwise we need multiple extent state record structures to track each extent buffer. In fact, my earlier approach did that: https://lore.kernel.org/linux-btrfs/3aae7c6728257c7ce2279d6660ee2797e5e34bbd.1641300250.git.fdmanana@suse.com/ However that can cause a very significant negative impact on performance, not only due to the extra memory usage but also because we get a larger and deeper dirty_log_pages io tree. We got a report that, on beefy machines at least, we can get such performance drop with fsmark for example: https://lore.kernel.org/linux-btrfs/20220117082426.GE32491@xsang-OptiPlex-9020/ 2) We would be doing it only to deal with an unexpected and exceptional case, which is basically failure to read an extent buffer from disk due to IO failures. On a healthy system we don't expect transaction aborts to happen after all; 3) Instead of relying on iterating the log tree or tracking the ranges of extent buffers in the dirty_log_pages io tree, using the radix tree that tracks extent buffers (fs_info->buffer_radix) to find all log tree extent buffers is not reliable either, because after writeback of an extent buffer it can be evicted from memory by the release page callback of the btree inode (btree_releasepage()). Since there's no way to be able to properly cleanup a log tree without being able to read its extent buffers from disk and without using more memory to track the logical ranges of the allocated extent buffers do the following: 1) When we fail to cleanup a log tree, setup a flag that indicates that failure; 2) Trigger writeback of all log tree extent buffers that are still dirty, and wait for the writeback to complete. This is just to cleanup their state, page states, page leaks, etc; 3) When unmounting the fs, ignore if the number of bytes reserved in a block group and in a space_info is not 0 if, and only if, we failed to cleanup a log tree. Also ignore only for metadata block groups and the metadata space_info object. This is far from a perfect solution, but it serves to silence test failures such as those from generic/475 and generic/648. However having a non-zero value for the reserved bytes counters on unmount after a transaction abort, is not such a terrible thing and it's completely harmless, it does not affect the filesystem integrity in any way. Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Anand Jain Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/block-group.c | 26 ++++++++++++++++++++++++-- fs/btrfs/ctree.h | 7 +++++++ fs/btrfs/tree-log.c | 23 +++++++++++++++++++++++ 3 files changed, 54 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5edd07e0232d..e1c5c2114edf 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -123,7 +123,16 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) { if (refcount_dec_and_test(&cache->refs)) { WARN_ON(cache->pinned > 0); - WARN_ON(cache->reserved > 0); + /* + * If there was a failure to cleanup a log tree, very likely due + * to an IO failure on a writeback attempt of one or more of its + * extent buffers, we could not do proper (and cheap) unaccounting + * of their reserved space, so don't warn on reserved > 0 in that + * case. + */ + if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info)) + WARN_ON(cache->reserved > 0); /* * A block_group shouldn't be on the discard_list anymore. @@ -3888,9 +3897,22 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) * important and indicates a real bug if this happens. */ if (WARN_ON(space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0 || space_info->bytes_may_use > 0)) btrfs_dump_space_info(info, space_info, 0, 0); + + /* + * If there was a failure to cleanup a log tree, very likely due + * to an IO failure on a writeback attempt of one or more of its + * extent buffers, we could not do proper (and cheap) unaccounting + * of their reserved space, so don't warn on bytes_reserved > 0 in + * that case. + */ + if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { + if (WARN_ON(space_info->bytes_reserved > 0)) + btrfs_dump_space_info(info, space_info, 0, 0); + } + WARN_ON(space_info->reclaim_size > 0); list_del(&space_info->list); btrfs_sysfs_remove_space_info(space_info); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e89f814cc8f5..21c44846b002 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -142,6 +142,9 @@ enum { BTRFS_FS_STATE_DEV_REPLACING, /* The btrfs_fs_info created for self-tests */ BTRFS_FS_STATE_DUMMY_FS_INFO, + + /* Indicates there was an error cleaning up a log tree. */ + BTRFS_FS_STATE_LOG_CLEANUP_ERROR, }; #define BTRFS_BACKREF_REV_MAX 256 @@ -3578,6 +3581,10 @@ do { \ (errno), fmt, ##args); \ } while (0) +#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ + (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ + &(fs_info)->fs_state))) + __printf(5, 6) __cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8ef65073ce8c..e90d80a8a9e3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3423,6 +3423,29 @@ static void free_log_tree(struct btrfs_trans_handle *trans, if (log->node) { ret = walk_log_tree(trans, log, &wc); if (ret) { + /* + * We weren't able to traverse the entire log tree, the + * typical scenario is getting an -EIO when reading an + * extent buffer of the tree, due to a previous writeback + * failure of it. + */ + set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + &log->fs_info->fs_state); + + /* + * Some extent buffers of the log tree may still be dirty + * and not yet written back to storage, because we may + * have updates to a log tree without syncing a log tree, + * such as during rename and link operations. So flush + * them out and wait for their writeback to complete, so + * that we properly cleanup their state and pages. + */ + btrfs_write_marked_extents(log->fs_info, + &log->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); + btrfs_wait_tree_log_extents(log, + EXTENT_DIRTY | EXTENT_NEW); + if (trans) btrfs_abort_transaction(trans, ret); else -- cgit v1.2.3 From 2fafbc198613823943c106d1ec9b516da692059f Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 4 Mar 2022 10:31:49 +1000 Subject: cifs: fix handlecache and multiuser commit 47178c7722ac528ea08aa82c3ef9ffa178962d7a upstream. In multiuser each individual user has their own tcon structure for the share and thus their own handle for a cached directory. When we umount such a share we much make sure to release the pinned down dentry for each such tcon and not just the master tcon. Otherwise we will get nasty warnings on umount that dentries are still in use: [ 3459.590047] BUG: Dentry 00000000115c6f41{i=12000000019d95,n=/} still in use\ (2) [unmount of cifs cifs] ... [ 3459.590492] Call Trace: [ 3459.590500] d_walk+0x61/0x2a0 [ 3459.590518] ? shrink_lock_dentry.part.0+0xe0/0xe0 [ 3459.590526] shrink_dcache_for_umount+0x49/0x110 [ 3459.590535] generic_shutdown_super+0x1a/0x110 [ 3459.590542] kill_anon_super+0x14/0x30 [ 3459.590549] cifs_kill_sb+0xf5/0x104 [cifs] [ 3459.590773] deactivate_locked_super+0x36/0xa0 [ 3459.590782] cleanup_mnt+0x131/0x190 [ 3459.590789] task_work_run+0x5c/0x90 [ 3459.590798] exit_to_user_mode_loop+0x151/0x160 [ 3459.590809] exit_to_user_mode_prepare+0x83/0xd0 [ 3459.590818] syscall_exit_to_user_mode+0x12/0x30 [ 3459.590828] do_syscall_64+0x48/0x90 [ 3459.590833] entry_SYSCALL_64_after_hwframe+0x44/0xae Signed-off-by: Ronnie Sahlberg Acked-by: Paulo Alcantara (SUSE) Cc: stable@vger.kernel.org Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsfs.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 21bf82fc2278..e56ac76be146 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -254,6 +254,9 @@ static void cifs_kill_sb(struct super_block *sb) struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon; struct cached_fid *cfid; + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node; + struct tcon_link *tlink; /* * We ned to release all dentries for the cached directories @@ -263,17 +266,21 @@ static void cifs_kill_sb(struct super_block *sb) dput(cifs_sb->root); cifs_sb->root = NULL; } - tcon = cifs_sb_master_tcon(cifs_sb); - if (tcon) { + spin_lock(&cifs_sb->tlink_tree_lock); + node = rb_first(root); + while (node != NULL) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + tcon = tlink_tcon(tlink); cfid = &tcon->crfid; mutex_lock(&cfid->fid_mutex); if (cfid->dentry) { - dput(cfid->dentry); cfid->dentry = NULL; } mutex_unlock(&cfid->fid_mutex); + node = rb_next(node); } + spin_unlock(&cifs_sb->tlink_tree_lock); kill_anon_super(sb); cifs_umount(cifs_sb); -- cgit v1.2.3 From 512bde6420870f5403e253866d8de7a2267275bb Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 15 Mar 2022 13:44:04 +1000 Subject: cifs: we do not need a spinlock around the tree access during umount commit 9a14b65d590105d393b63f5320e1594edda7c672 upstream. Remove the spinlock around the tree traversal as we are calling possibly sleeping functions. We do not need a spinlock here as there will be no modifications to this tree at this point. This prevents warnings like this to occur in dmesg: [ 653.774996] BUG: sleeping function called from invalid context at kernel/loc\ king/mutex.c:280 [ 653.775088] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1827, nam\ e: umount [ 653.775152] preempt_count: 1, expected: 0 [ 653.775191] CPU: 0 PID: 1827 Comm: umount Tainted: G W OE 5.17.0\ -rc7-00006-g4eb628dd74df #135 [ 653.775195] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-\ 1.fc33 04/01/2014 [ 653.775197] Call Trace: [ 653.775199] [ 653.775202] dump_stack_lvl+0x34/0x44 [ 653.775209] __might_resched.cold+0x13f/0x172 [ 653.775213] mutex_lock+0x75/0xf0 [ 653.775217] ? __mutex_lock_slowpath+0x10/0x10 [ 653.775220] ? _raw_write_lock_irq+0xd0/0xd0 [ 653.775224] ? dput+0x6b/0x360 [ 653.775228] cifs_kill_sb+0xff/0x1d0 [cifs] [ 653.775285] deactivate_locked_super+0x85/0x130 [ 653.775289] cleanup_mnt+0x32c/0x4d0 [ 653.775292] ? path_umount+0x228/0x380 [ 653.775296] task_work_run+0xd8/0x180 [ 653.775301] exit_to_user_mode_loop+0x152/0x160 [ 653.775306] exit_to_user_mode_prepare+0x89/0xd0 [ 653.775315] syscall_exit_to_user_mode+0x12/0x30 [ 653.775322] do_syscall_64+0x48/0x90 [ 653.775326] entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: 187af6e98b44e5d8f25e1d41a92db138eb54416f ("cifs: fix handlecache and multiuser") Reported-by: kernel test robot Cc: stable@vger.kernel.org Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsfs.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e56ac76be146..22a1d8156220 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -266,7 +266,6 @@ static void cifs_kill_sb(struct super_block *sb) dput(cifs_sb->root); cifs_sb->root = NULL; } - spin_lock(&cifs_sb->tlink_tree_lock); node = rb_first(root); while (node != NULL) { tlink = rb_entry(node, struct tcon_link, tl_rbnode); @@ -280,7 +279,6 @@ static void cifs_kill_sb(struct super_block *sb) mutex_unlock(&cfid->fid_mutex); node = rb_next(node); } - spin_unlock(&cifs_sb->tlink_tree_lock); kill_anon_super(sb); cifs_umount(cifs_sb); -- cgit v1.2.3 From 253a9533941ef89156955ea9f24e09e182450ba9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Feb 2022 18:20:38 -0500 Subject: NFS: NFSv2/v3 clients should never be setting NFS_CAP_XATTR commit b622ffe1d9ecbac71f0cddb52ff0831efdf8fb83 upstream. Ensure that we always initialise the 'xattr_support' field in struct nfs_fsinfo, so that nfs_server_set_fsinfo() doesn't declare our NFSv2/v3 client to be capable of supporting the NFSv4.2 xattr protocol by setting the NFS_CAP_XATTR capability. This configuration can cause nfs_do_access() to set access mode bits that are unsupported by the NFSv3 ACCESS call, which may confuse spec-compliant servers. Reported-by: Olga Kornievskaia Fixes: b78ef845c35d ("NFSv4.2: query the server for extended attribute support") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs3xdr.c | 1 + fs/nfs/proc.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 9274c9c5efea..54a1d21cbcc6 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -2228,6 +2228,7 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr, /* ignore properties */ result->lease_time = 0; result->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; + result->xattr_support = 0; return 0; } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index ecc4e717808c..a5b0bdcb5396 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -92,6 +92,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, info->maxfilesize = 0x7FFFFFFF; info->lease_time = 0; info->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; + info->xattr_support = 0; return 0; } -- cgit v1.2.3 From 614a61e1592051cc42d3c38f899c9f7bdaad8a1d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 15 Mar 2022 13:30:09 +0300 Subject: NFSD: prevent underflow in nfssvc_decode_writeargs() commit 184416d4b98509fb4c3d8fc3d6dc1437896cc159 upstream. Smatch complains: fs/nfsd/nfsxdr.c:341 nfssvc_decode_writeargs() warn: no lower bound on 'args->len' Change the type to unsigned to prevent this issue. Cc: stable@vger.kernel.org Signed-off-by: Dan Carpenter Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfsproc.c | 2 +- fs/nfsd/xdr.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 312fd289be58..9700ad433b49 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -230,7 +230,7 @@ nfsd_proc_write(struct svc_rqst *rqstp) unsigned long cnt = argp->len; unsigned int nvecs; - dprintk("nfsd: WRITE %s %d bytes at %d\n", + dprintk("nfsd: WRITE %s %u bytes at %d\n", SVCFH_fmt(&argp->fh), argp->len, argp->offset); diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index 80fd6d7f3404..863a35f24910 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -32,7 +32,7 @@ struct nfsd_readargs { struct nfsd_writeargs { svc_fh fh; __u32 offset; - int len; + __u32 len; struct xdr_buf payload; }; -- cgit v1.2.3 From e98ae961b3344d931f66558b5796f8bc00b54aba Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 4 Feb 2022 11:21:14 +0800 Subject: f2fs: fix to unlock page correctly in error path of is_alive() commit 6d18762ed5cd549fde74fd0e05d4d87bac5a3beb upstream. As Pavel Machek reported in below link [1]: After commit 77900c45ee5c ("f2fs: fix to do sanity check in is_alive()"), node page should be unlock via calling f2fs_put_page() in the error path of is_alive(), otherwise, f2fs may hang when it tries to lock the node page, fix it. [1] https://lore.kernel.org/stable/20220124203637.GA19321@duo.ucw.cz/ Fixes: 77900c45ee5c ("f2fs: fix to do sanity check in is_alive()") Cc: Reported-by: Pavel Machek Signed-off-by: Pavel Machek Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/gc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 264821df0add..ceb5fc6b4039 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1023,8 +1023,10 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, set_sbi_flag(sbi, SBI_NEED_FSCK); } - if (f2fs_check_nid_range(sbi, dni->ino)) + if (f2fs_check_nid_range(sbi, dni->ino)) { + f2fs_put_page(node_page, 1); return false; + } *nofs = ofs_of_node(node_page); source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); -- cgit v1.2.3 From f9156db0987f1b426015d56505e2c58dee70c90d Mon Sep 17 00:00:00 2001 From: Juhyung Park Date: Tue, 15 Feb 2022 17:27:21 +0900 Subject: f2fs: quota: fix loop condition at f2fs_quota_sync() commit 680af5b824a52faa819167628665804a14f0e0df upstream. cnt should be passed to sb_has_quota_active() instead of type to check active quota properly. Moreover, when the type is -1, the compiler with enough inline knowledge can discard sb_has_quota_active() check altogether, causing a NULL pointer dereference at the following inode_lock(dqopt->files[cnt]): [ 2.796010] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000a0 [ 2.796024] Mem abort info: [ 2.796025] ESR = 0x96000005 [ 2.796028] EC = 0x25: DABT (current EL), IL = 32 bits [ 2.796029] SET = 0, FnV = 0 [ 2.796031] EA = 0, S1PTW = 0 [ 2.796032] Data abort info: [ 2.796034] ISV = 0, ISS = 0x00000005 [ 2.796035] CM = 0, WnR = 0 [ 2.796046] user pgtable: 4k pages, 39-bit VAs, pgdp=00000003370d1000 [ 2.796048] [00000000000000a0] pgd=0000000000000000, pud=0000000000000000 [ 2.796051] Internal error: Oops: 96000005 [#1] PREEMPT SMP [ 2.796056] CPU: 7 PID: 640 Comm: f2fs_ckpt-259:7 Tainted: G S 5.4.179-arter97-r8-64666-g2f16e087f9d8 #1 [ 2.796057] Hardware name: Qualcomm Technologies, Inc. Lahaina MTP lemonadep (DT) [ 2.796059] pstate: 80c00005 (Nzcv daif +PAN +UAO) [ 2.796065] pc : down_write+0x28/0x70 [ 2.796070] lr : f2fs_quota_sync+0x100/0x294 [ 2.796071] sp : ffffffa3f48ffc30 [ 2.796073] x29: ffffffa3f48ffc30 x28: 0000000000000000 [ 2.796075] x27: ffffffa3f6d718b8 x26: ffffffa415fe9d80 [ 2.796077] x25: ffffffa3f7290048 x24: 0000000000000001 [ 2.796078] x23: 0000000000000000 x22: ffffffa3f7290000 [ 2.796080] x21: ffffffa3f72904a0 x20: ffffffa3f7290110 [ 2.796081] x19: ffffffa3f77a9800 x18: ffffffc020aae038 [ 2.796083] x17: ffffffa40e38e040 x16: ffffffa40e38e6d0 [ 2.796085] x15: ffffffa40e38e6cc x14: ffffffa40e38e6d0 [ 2.796086] x13: 00000000000004f6 x12: 00162c44ff493000 [ 2.796088] x11: 0000000000000400 x10: ffffffa40e38c948 [ 2.796090] x9 : 0000000000000000 x8 : 00000000000000a0 [ 2.796091] x7 : 0000000000000000 x6 : 0000d1060f00002a [ 2.796093] x5 : ffffffa3f48ff718 x4 : 000000000000000d [ 2.796094] x3 : 00000000060c0000 x2 : 0000000000000001 [ 2.796096] x1 : 0000000000000000 x0 : 00000000000000a0 [ 2.796098] Call trace: [ 2.796100] down_write+0x28/0x70 [ 2.796102] f2fs_quota_sync+0x100/0x294 [ 2.796104] block_operations+0x120/0x204 [ 2.796106] f2fs_write_checkpoint+0x11c/0x520 [ 2.796107] __checkpoint_and_complete_reqs+0x7c/0xd34 [ 2.796109] issue_checkpoint_thread+0x6c/0xb8 [ 2.796112] kthread+0x138/0x414 [ 2.796114] ret_from_fork+0x10/0x18 [ 2.796117] Code: aa0803e0 aa1f03e1 52800022 aa0103e9 (c8e97d02) [ 2.796120] ---[ end trace 96e942e8eb6a0b53 ]--- [ 2.800116] Kernel panic - not syncing: Fatal exception [ 2.800120] SMP: stopping secondary CPUs Fixes: 9de71ede81e6 ("f2fs: quota: fix potential deadlock") Cc: # v5.15+ Signed-off-by: Juhyung Park Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6174c4f4cee7..7b744ceb17a5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2676,7 +2676,7 @@ int f2fs_quota_sync(struct super_block *sb, int type) struct f2fs_sb_info *sbi = F2FS_SB(sb); struct quota_info *dqopt = sb_dqopt(sb); int cnt; - int ret; + int ret = 0; /* * Now when everything is written we can discard the pagecache so @@ -2687,8 +2687,8 @@ int f2fs_quota_sync(struct super_block *sb, int type) if (type != -1 && cnt != type) continue; - if (!sb_has_quota_active(sb, type)) - return 0; + if (!sb_has_quota_active(sb, cnt)) + continue; inode_lock(dqopt->files[cnt]); -- cgit v1.2.3 From b065f398c8602af2014015ac978526351140a047 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 21 Mar 2022 23:22:11 +0800 Subject: f2fs: fix to do sanity check on .cp_pack_total_block_count commit 5b5b4f85b01604389f7a0f11ef180a725bf0e2d4 upstream. As bughunter reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215709 f2fs may hang when mounting a fuzzed image, the dmesg shows as below: __filemap_get_folio+0x3a9/0x590 pagecache_get_page+0x18/0x60 __get_meta_page+0x95/0x460 [f2fs] get_checkpoint_version+0x2a/0x1e0 [f2fs] validate_checkpoint+0x8e/0x2a0 [f2fs] f2fs_get_valid_checkpoint+0xd0/0x620 [f2fs] f2fs_fill_super+0xc01/0x1d40 [f2fs] mount_bdev+0x18a/0x1c0 f2fs_mount+0x15/0x20 [f2fs] legacy_get_tree+0x28/0x50 vfs_get_tree+0x27/0xc0 path_mount+0x480/0xaa0 do_mount+0x7c/0xa0 __x64_sys_mount+0x8b/0xe0 do_syscall_64+0x38/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae The root cause is cp_pack_total_block_count field in checkpoint was fuzzed to one, as calcuated, two cp pack block locates in the same block address, so then read latter cp pack block, it will block on the page lock due to the lock has already held when reading previous cp pack block, fix it by adding sanity check for cp_pack_total_block_count. Cc: stable@vger.kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/checkpoint.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 99fced979718..1ff32926e199 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -867,6 +867,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, struct page *cp_page_1 = NULL, *cp_page_2 = NULL; struct f2fs_checkpoint *cp_block = NULL; unsigned long long cur_version = 0, pre_version = 0; + unsigned int cp_blocks; int err; err = get_checkpoint_version(sbi, cp_addr, &cp_block, @@ -874,15 +875,16 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (err) return NULL; - if (le32_to_cpu(cp_block->cp_pack_total_block_count) > - sbi->blocks_per_seg) { + cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count); + + if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) { f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u", le32_to_cpu(cp_block->cp_pack_total_block_count)); goto invalid_cp; } pre_version = *version; - cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; + cp_addr += cp_blocks - 1; err = get_checkpoint_version(sbi, cp_addr, &cp_block, &cp_page_2, version); if (err) -- cgit v1.2.3 From 7a75740206af5f17e9f3efa384211cba70213da1 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 28 Dec 2021 20:54:30 +0800 Subject: jffs2: fix use-after-free in jffs2_clear_xattr_subsystem commit 4c7c44ee1650677fbe89d86edbad9497b7679b5c upstream. When we mount a jffs2 image, assume that the first few blocks of the image are normal and contain at least one xattr-related inode, but the next block is abnormal. As a result, an error is returned in jffs2_scan_eraseblock(). jffs2_clear_xattr_subsystem() is then called in jffs2_build_filesystem() and then again in jffs2_do_fill_super(). Finally we can observe the following report: ================================================================== BUG: KASAN: use-after-free in jffs2_clear_xattr_subsystem+0x95/0x6ac Read of size 8 at addr ffff8881243384e0 by task mount/719 Call Trace: dump_stack+0x115/0x16b jffs2_clear_xattr_subsystem+0x95/0x6ac jffs2_do_fill_super+0x84f/0xc30 jffs2_fill_super+0x2ea/0x4c0 mtd_get_sb+0x254/0x400 mtd_get_sb_by_nr+0x4f/0xd0 get_tree_mtd+0x498/0x840 jffs2_get_tree+0x25/0x30 vfs_get_tree+0x8d/0x2e0 path_mount+0x50f/0x1e50 do_mount+0x107/0x130 __se_sys_mount+0x1c5/0x2f0 __x64_sys_mount+0xc7/0x160 do_syscall_64+0x45/0x70 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Allocated by task 719: kasan_save_stack+0x23/0x60 __kasan_kmalloc.constprop.0+0x10b/0x120 kasan_slab_alloc+0x12/0x20 kmem_cache_alloc+0x1c0/0x870 jffs2_alloc_xattr_ref+0x2f/0xa0 jffs2_scan_medium.cold+0x3713/0x4794 jffs2_do_mount_fs.cold+0xa7/0x2253 jffs2_do_fill_super+0x383/0xc30 jffs2_fill_super+0x2ea/0x4c0 [...] Freed by task 719: kmem_cache_free+0xcc/0x7b0 jffs2_free_xattr_ref+0x78/0x98 jffs2_clear_xattr_subsystem+0xa1/0x6ac jffs2_do_mount_fs.cold+0x5e6/0x2253 jffs2_do_fill_super+0x383/0xc30 jffs2_fill_super+0x2ea/0x4c0 [...] The buggy address belongs to the object at ffff8881243384b8 which belongs to the cache jffs2_xattr_ref of size 48 The buggy address is located 40 bytes inside of 48-byte region [ffff8881243384b8, ffff8881243384e8) [...] ================================================================== The triggering of the BUG is shown in the following stack: ----------------------------------------------------------- jffs2_fill_super jffs2_do_fill_super jffs2_do_mount_fs jffs2_build_filesystem jffs2_scan_medium jffs2_scan_eraseblock <--- ERROR jffs2_clear_xattr_subsystem <--- free jffs2_clear_xattr_subsystem <--- free again ----------------------------------------------------------- An error is returned in jffs2_do_mount_fs(). If the error is returned by jffs2_sum_init(), the jffs2_clear_xattr_subsystem() does not need to be executed. If the error is returned by jffs2_build_filesystem(), the jffs2_clear_xattr_subsystem() also does not need to be executed again. So move jffs2_clear_xattr_subsystem() from 'out_inohash' to 'out_root' to fix this UAF problem. Fixes: aa98d7cf59b5 ("[JFFS2][XATTR] XATTR support on JFFS2 (version. 5)") Cc: stable@vger.kernel.org Reported-by: Hulk Robot Signed-off-by: Baokun Li Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/jffs2/fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 2ac410477c4f..71f03a5d36ed 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -603,8 +603,8 @@ out_root: jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); kvfree(c->blocks); - out_inohash: jffs2_clear_xattr_subsystem(c); + out_inohash: kfree(c->inocache_list); out_wbuf: jffs2_flash_cleanup(c); -- cgit v1.2.3 From 4392e8aeebc5a4f8073620bccba7de1b1f6d7c88 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 14 Jan 2022 18:28:53 +0800 Subject: jffs2: fix memory leak in jffs2_do_mount_fs commit d051cef784de4d54835f6b6836d98a8f6935772c upstream. If jffs2_build_filesystem() in jffs2_do_mount_fs() returns an error, we can observe the following kmemleak report: -------------------------------------------- unreferenced object 0xffff88811b25a640 (size 64): comm "mount", pid 691, jiffies 4294957728 (age 71.952s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmem_cache_alloc_trace+0x584/0x880 [] jffs2_sum_init+0x86/0x130 [] jffs2_do_mount_fs+0x798/0xac0 [] jffs2_do_fill_super+0x383/0xc30 [] jffs2_fill_super+0x2ea/0x4c0 [...] unreferenced object 0xffff88812c760000 (size 65536): comm "mount", pid 691, jiffies 4294957728 (age 71.952s) hex dump (first 32 bytes): bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ backtrace: [] __kmalloc+0x6b9/0x910 [] jffs2_sum_init+0xd7/0x130 [] jffs2_do_mount_fs+0x798/0xac0 [] jffs2_do_fill_super+0x383/0xc30 [] jffs2_fill_super+0x2ea/0x4c0 [...] -------------------------------------------- This is because the resources allocated in jffs2_sum_init() are not released. Call jffs2_sum_exit() to release these resources to solve the problem. Fixes: e631ddba5887 ("[JFFS2] Add erase block summary support (mount time improvement)") Cc: stable@vger.kernel.org Signed-off-by: Baokun Li Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/jffs2/build.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index b288c8ae1236..837cd55fd4c5 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -415,13 +415,15 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c) jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); ret = -EIO; - goto out_free; + goto out_sum_exit; } jffs2_calc_trigger_levels(c); return 0; + out_sum_exit: + jffs2_sum_exit(c); out_free: kvfree(c->blocks); -- cgit v1.2.3 From 52ba0ab4f0a606f02a6163493378989faa1ec10a Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 14 Jan 2022 18:28:54 +0800 Subject: jffs2: fix memory leak in jffs2_scan_medium commit 9cdd3128874f5fe759e2c4e1360ab7fb96a8d1df upstream. If an error is returned in jffs2_scan_eraseblock() and some memory has been added to the jffs2_summary *s, we can observe the following kmemleak report: -------------------------------------------- unreferenced object 0xffff88812b889c40 (size 64): comm "mount", pid 692, jiffies 4294838325 (age 34.288s) hex dump (first 32 bytes): 40 48 b5 14 81 88 ff ff 01 e0 31 00 00 00 50 00 @H........1...P. 00 00 01 00 00 00 01 00 00 00 02 00 00 00 09 08 ................ backtrace: [] __kmalloc+0x613/0x910 [] jffs2_sum_add_dirent_mem+0x5c/0xa0 [] jffs2_scan_medium.cold+0x36e5/0x4794 [] jffs2_do_mount_fs.cold+0xa7/0x2267 [] jffs2_do_fill_super+0x383/0xc30 [] jffs2_fill_super+0x2ea/0x4c0 [] mtd_get_sb+0x254/0x400 [] mtd_get_sb_by_nr+0x4f/0xd0 [] get_tree_mtd+0x498/0x840 [] jffs2_get_tree+0x25/0x30 [] vfs_get_tree+0x8d/0x2e0 [] path_mount+0x50f/0x1e50 [] do_mount+0x107/0x130 [] __se_sys_mount+0x1c5/0x2f0 [] __x64_sys_mount+0xc7/0x160 [] do_syscall_64+0x45/0x70 unreferenced object 0xffff888114b54840 (size 32): comm "mount", pid 692, jiffies 4294838325 (age 34.288s) hex dump (first 32 bytes): c0 75 b5 14 81 88 ff ff 02 e0 02 00 00 00 02 00 .u.............. 00 00 84 00 00 00 44 00 00 00 6b 6b 6b 6b 6b a5 ......D...kkkkk. backtrace: [] kmem_cache_alloc_trace+0x584/0x880 [] jffs2_sum_add_inode_mem+0x54/0x90 [] jffs2_scan_medium.cold+0x4481/0x4794 [...] unreferenced object 0xffff888114b57280 (size 32): comm "mount", pid 692, jiffies 4294838393 (age 34.357s) hex dump (first 32 bytes): 10 d5 6c 11 81 88 ff ff 08 e0 05 00 00 00 01 00 ..l............. 00 00 38 02 00 00 28 00 00 00 6b 6b 6b 6b 6b a5 ..8...(...kkkkk. backtrace: [] kmem_cache_alloc_trace+0x584/0x880 [] jffs2_sum_add_xattr_mem+0x54/0x90 [] jffs2_scan_medium.cold+0x298c/0x4794 [...] unreferenced object 0xffff8881116cd510 (size 16): comm "mount", pid 692, jiffies 4294838395 (age 34.355s) hex dump (first 16 bytes): 00 00 00 00 00 00 00 00 09 e0 60 02 00 00 6b a5 ..........`...k. backtrace: [] kmem_cache_alloc_trace+0x584/0x880 [] jffs2_sum_add_xref_mem+0x54/0x90 [] jffs2_scan_medium.cold+0x3a20/0x4794 [...] -------------------------------------------- Therefore, we should call jffs2_sum_reset_collected(s) on exit to release the memory added in s. In addition, a new tag "out_buf" is added to prevent the NULL pointer reference caused by s being NULL. (thanks to Zhang Yi for this analysis) Fixes: e631ddba5887 ("[JFFS2] Add erase block summary support (mount time improvement)") Cc: stable@vger.kernel.org Co-developed-with: Zhihao Cheng Signed-off-by: Baokun Li Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/jffs2/scan.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index b676056826be..29671e33a171 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -136,7 +136,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) if (!s) { JFFS2_WARNING("Can't allocate memory for summary\n"); ret = -ENOMEM; - goto out; + goto out_buf; } } @@ -275,13 +275,15 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) } ret = 0; out: + jffs2_sum_reset_collected(s); + kfree(s); + out_buf: if (buf_size) kfree(flashbuf); #ifndef __ECOS else mtd_unpoint(c->mtd, 0, c->mtd->size); #endif - kfree(s); return ret; } -- cgit v1.2.3 From ab657a29c3e9f02a7c7ca9ddaa7b176dec04f86e Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 22 Mar 2022 14:39:34 -0700 Subject: mm: fs: fix lru_cache_disabled race in bh_lru commit c0226eb8bde854e016a594a16f5c0d98aca426fa upstream. Check lru_cache_disabled under bh_lru_lock. Otherwise, it could introduce race below and it fails to migrate pages containing buffer_head. CPU 0 CPU 1 bh_lru_install lru_cache_disable lru_cache_disabled = false atomic_inc(&lru_disable_count); invalidate_bh_lrus_cpu of CPU 0 bh_lru_lock __invalidate_bh_lrus bh_lru_unlock bh_lru_lock install the bh bh_lru_unlock WHen this race happens a CMA allocation fails, which is critical for the workload which depends on CMA. Link: https://lkml.kernel.org/r/20220308180709.2017638-1-minchan@kernel.org Fixes: 8cc621d2f45d ("mm: fs: invalidate BH LRU during page migration") Signed-off-by: Minchan Kim Cc: Chris Goldsworthy Cc: Marcelo Tosatti Cc: John Dias Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/buffer.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index c615387aedca..f6d283579491 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1235,16 +1235,18 @@ static void bh_lru_install(struct buffer_head *bh) int i; check_irqs_on(); + bh_lru_lock(); + /* * the refcount of buffer_head in bh_lru prevents dropping the * attached page(i.e., try_to_free_buffers) so it could cause * failing page migration. * Skip putting upcoming bh into bh_lru until migration is done. */ - if (lru_cache_disabled()) + if (lru_cache_disabled()) { + bh_lru_unlock(); return; - - bh_lru_lock(); + } b = this_cpu_ptr(&bh_lrus); for (i = 0; i < BH_LRU_SIZE; i++) { -- cgit v1.2.3 From f143f8334fb9eb2f6c7c15b9da1472d9c965fd84 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 29 Mar 2022 16:20:05 -0300 Subject: cifs: prevent bad output lengths in smb2_ioctl_query_info() commit b92e358757b91c2827af112cae9af513f26a3f34 upstream. When calling smb2_ioctl_query_info() with smb_query_info::flags=PASSTHRU_FSCTL and smb_query_info::output_buffer_length=0, the following would return 0x10 buffer = memdup_user(arg + sizeof(struct smb_query_info), qi.output_buffer_length); if (IS_ERR(buffer)) { kfree(vars); return PTR_ERR(buffer); } rather than a valid pointer thus making IS_ERR() check fail. This would then cause a NULL ptr deference in @buffer when accessing it later in smb2_ioctl_query_ioctl(). While at it, prevent having a @buffer smaller than 8 bytes to correctly handle SMB2_SET_INFO FileEndOfFileInformation requests when smb_query_info::flags=PASSTHRU_SET_INFO. Here is a small C reproducer which triggers a NULL ptr in @buffer when passing an invalid smb_query_info::flags #include #include #include #include #include #include #define die(s) perror(s), exit(1) #define QUERY_INFO 0xc018cf07 int main(int argc, char *argv[]) { int fd; if (argc < 2) exit(1); fd = open(argv[1], O_RDONLY); if (fd == -1) die("open"); if (ioctl(fd, QUERY_INFO, (uint32_t[]) { 0, 0, 0, 4, 0, 0}) == -1) die("ioctl"); close(fd); return 0; } mount.cifs //srv/share /mnt -o ... gcc repro.c && ./a.out /mnt/f0 [ 114.138620] general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN NOPTI [ 114.139310] KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] [ 114.139775] CPU: 2 PID: 995 Comm: a.out Not tainted 5.17.0-rc8 #1 [ 114.140148] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.15.0-0-g2dd4b9b-rebuilt.opensuse.org 04/01/2014 [ 114.140818] RIP: 0010:smb2_ioctl_query_info+0x206/0x410 [cifs] [ 114.141221] Code: 00 00 00 00 fc ff df 48 c1 ea 03 80 3c 02 00 0f 85 c8 01 00 00 48 b8 00 00 00 00 00 fc ff df 4c 8b 7b 28 4c 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 9c 01 00 00 49 8b 3f e8 58 02 fb ff 48 8b 14 24 [ 114.142348] RSP: 0018:ffffc90000b47b00 EFLAGS: 00010256 [ 114.142692] RAX: dffffc0000000000 RBX: ffff888115503200 RCX: ffffffffa020580d [ 114.143119] RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffffffffa043a380 [ 114.143544] RBP: ffff888115503278 R08: 0000000000000001 R09: 0000000000000003 [ 114.143983] R10: fffffbfff4087470 R11: 0000000000000001 R12: ffff888115503288 [ 114.144424] R13: 00000000ffffffea R14: ffff888115503228 R15: 0000000000000000 [ 114.144852] FS: 00007f7aeabdf740(0000) GS:ffff888151600000(0000) knlGS:0000000000000000 [ 114.145338] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 114.145692] CR2: 00007f7aeacfdf5e CR3: 000000012000e000 CR4: 0000000000350ee0 [ 114.146131] Call Trace: [ 114.146291] [ 114.146432] ? smb2_query_reparse_tag+0x890/0x890 [cifs] [ 114.146800] ? cifs_mapchar+0x460/0x460 [cifs] [ 114.147121] ? rcu_read_lock_sched_held+0x3f/0x70 [ 114.147412] ? cifs_strndup_to_utf16+0x15b/0x250 [cifs] [ 114.147775] ? dentry_path_raw+0xa6/0xf0 [ 114.148024] ? cifs_convert_path_to_utf16+0x198/0x220 [cifs] [ 114.148413] ? smb2_check_message+0x1080/0x1080 [cifs] [ 114.148766] ? rcu_read_lock_sched_held+0x3f/0x70 [ 114.149065] cifs_ioctl+0x1577/0x3320 [cifs] [ 114.149371] ? lock_downgrade+0x6f0/0x6f0 [ 114.149631] ? cifs_readdir+0x2e60/0x2e60 [cifs] [ 114.149956] ? rcu_read_lock_sched_held+0x3f/0x70 [ 114.150250] ? __rseq_handle_notify_resume+0x80b/0xbe0 [ 114.150562] ? __up_read+0x192/0x710 [ 114.150791] ? __ia32_sys_rseq+0xf0/0xf0 [ 114.151025] ? __x64_sys_openat+0x11f/0x1d0 [ 114.151296] __x64_sys_ioctl+0x127/0x190 [ 114.151549] do_syscall_64+0x3b/0x90 [ 114.151768] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 114.152079] RIP: 0033:0x7f7aead043df [ 114.152306] Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05 <41> 89 c0 3d 00 f0 ff ff 77 1f 48 8b 44 24 18 64 48 2b 04 25 28 00 [ 114.153431] RSP: 002b:00007ffc2e0c1f80 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 114.153890] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f7aead043df [ 114.154315] RDX: 00007ffc2e0c1ff0 RSI: 00000000c018cf07 RDI: 0000000000000003 [ 114.154747] RBP: 00007ffc2e0c2010 R08: 00007f7aeae03db0 R09: 00007f7aeae24c4e [ 114.155192] R10: 00007f7aeabf7d40 R11: 0000000000000246 R12: 00007ffc2e0c2128 [ 114.155642] R13: 0000000000401176 R14: 0000000000403df8 R15: 00007f7aeae57000 [ 114.156071] [ 114.156218] Modules linked in: cifs cifs_arc4 cifs_md4 bpf_preload [ 114.156608] ---[ end trace 0000000000000000 ]--- [ 114.156898] RIP: 0010:smb2_ioctl_query_info+0x206/0x410 [cifs] [ 114.157792] Code: 00 00 00 00 fc ff df 48 c1 ea 03 80 3c 02 00 0f 85 c8 01 00 00 48 b8 00 00 00 00 00 fc ff df 4c 8b 7b 28 4c 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 9c 01 00 00 49 8b 3f e8 58 02 fb ff 48 8b 14 24 [ 114.159293] RSP: 0018:ffffc90000b47b00 EFLAGS: 00010256 [ 114.159641] RAX: dffffc0000000000 RBX: ffff888115503200 RCX: ffffffffa020580d [ 114.160093] RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffffffffa043a380 [ 114.160699] RBP: ffff888115503278 R08: 0000000000000001 R09: 0000000000000003 [ 114.161196] R10: fffffbfff4087470 R11: 0000000000000001 R12: ffff888115503288 [ 114.155642] R13: 0000000000401176 R14: 0000000000403df8 R15: 00007f7aeae57000 [ 114.156071] [ 114.156218] Modules linked in: cifs cifs_arc4 cifs_md4 bpf_preload [ 114.156608] ---[ end trace 0000000000000000 ]--- [ 114.156898] RIP: 0010:smb2_ioctl_query_info+0x206/0x410 [cifs] [ 114.157792] Code: 00 00 00 00 fc ff df 48 c1 ea 03 80 3c 02 00 0f 85 c8 01 00 00 48 b8 00 00 00 00 00 fc ff df 4c 8b 7b 28 4c 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 9c 01 00 00 49 8b 3f e8 58 02 fb ff 48 8b 14 24 [ 114.159293] RSP: 0018:ffffc90000b47b00 EFLAGS: 00010256 [ 114.159641] RAX: dffffc0000000000 RBX: ffff888115503200 RCX: ffffffffa020580d [ 114.160093] RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffffffffa043a380 [ 114.160699] RBP: ffff888115503278 R08: 0000000000000001 R09: 0000000000000003 [ 114.161196] R10: fffffbfff4087470 R11: 0000000000000001 R12: ffff888115503288 [ 114.161823] R13: 00000000ffffffea R14: ffff888115503228 R15: 0000000000000000 [ 114.162274] FS: 00007f7aeabdf740(0000) GS:ffff888151600000(0000) knlGS:0000000000000000 [ 114.162853] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 114.163218] CR2: 00007f7aeacfdf5e CR3: 000000012000e000 CR4: 0000000000350ee0 [ 114.163691] Kernel panic - not syncing: Fatal exception [ 114.164087] Kernel Offset: disabled [ 114.164316] ---[ end Kernel panic - not syncing: Fatal exception ]--- Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2ops.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index bda606dc72b1..5923029ea586 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1656,11 +1656,12 @@ smb2_ioctl_query_info(const unsigned int xid, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - buffer = memdup_user(arg + sizeof(struct smb_query_info), - qi.output_buffer_length); - if (IS_ERR(buffer)) { - kfree(vars); - return PTR_ERR(buffer); + if (qi.output_buffer_length) { + buffer = memdup_user(arg + sizeof(struct smb_query_info), qi.output_buffer_length); + if (IS_ERR(buffer)) { + kfree(vars); + return PTR_ERR(buffer); + } } /* Open */ @@ -1723,10 +1724,13 @@ smb2_ioctl_query_info(const unsigned int xid, /* Can eventually relax perm check since server enforces too */ if (!capable(CAP_SYS_ADMIN)) rc = -EPERM; - else { + else if (qi.output_buffer_length < 8) + rc = -EINVAL; + else { rqst[1].rq_iov = &vars->si_iov[0]; rqst[1].rq_nvec = 1; + /* MS-FSCC 2.4.13 FileEndOfFileInformation */ size[0] = 8; data[0] = buffer; -- cgit v1.2.3 From 39a4bf7d1a23dd172526c2fb0db480c5d5c63bd6 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 29 Mar 2022 16:20:06 -0300 Subject: cifs: fix NULL ptr dereference in smb2_ioctl_query_info() commit d6f5e358452479fa8a773b5c6ccc9e4ec5a20880 upstream. When calling smb2_ioctl_query_info() with invalid smb_query_info::flags, a NULL ptr dereference is triggered when trying to kfree() uninitialised rqst[n].rq_iov array. This also fixes leaked paths that are created in SMB2_open_init() which required SMB2_open_free() to properly free them. Here is a small C reproducer that triggers it #include #include #include #include #include #include #define die(s) perror(s), exit(1) #define QUERY_INFO 0xc018cf07 int main(int argc, char *argv[]) { int fd; if (argc < 2) exit(1); fd = open(argv[1], O_RDONLY); if (fd == -1) die("open"); if (ioctl(fd, QUERY_INFO, (uint32_t[]) { 0, 0, 0, 4, 0, 0}) == -1) die("ioctl"); close(fd); return 0; } mount.cifs //srv/share /mnt -o ... gcc repro.c && ./a.out /mnt/f0 [ 1832.124468] CIFS: VFS: \\w22-dc.zelda.test\test Invalid passthru query flags: 0x4 [ 1832.125043] general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN NOPTI [ 1832.125764] KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] [ 1832.126241] CPU: 3 PID: 1133 Comm: a.out Not tainted 5.17.0-rc8 #2 [ 1832.126630] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.15.0-0-g2dd4b9b-rebuilt.opensuse.org 04/01/2014 [ 1832.127322] RIP: 0010:smb2_ioctl_query_info+0x7a3/0xe30 [cifs] [ 1832.127749] Code: 00 00 00 fc ff df 48 c1 ea 03 80 3c 02 00 0f 85 6c 05 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8b 74 24 28 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 cb 04 00 00 49 8b 3e e8 bb fc fa ff 48 89 da 48 [ 1832.128911] RSP: 0018:ffffc90000957b08 EFLAGS: 00010256 [ 1832.129243] RAX: dffffc0000000000 RBX: ffff888117e9b850 RCX: ffffffffa020580d [ 1832.129691] RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffffffffa043a2c0 [ 1832.130137] RBP: ffff888117e9b878 R08: 0000000000000001 R09: 0000000000000003 [ 1832.130585] R10: fffffbfff4087458 R11: 0000000000000001 R12: ffff888117e9b800 [ 1832.131037] R13: 00000000ffffffea R14: 0000000000000000 R15: ffff888117e9b8a8 [ 1832.131485] FS: 00007fcee9900740(0000) GS:ffff888151a00000(0000) knlGS:0000000000000000 [ 1832.131993] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1832.132354] CR2: 00007fcee9a1ef5e CR3: 0000000114cd2000 CR4: 0000000000350ee0 [ 1832.132801] Call Trace: [ 1832.132962] [ 1832.133104] ? smb2_query_reparse_tag+0x890/0x890 [cifs] [ 1832.133489] ? cifs_mapchar+0x460/0x460 [cifs] [ 1832.133822] ? rcu_read_lock_sched_held+0x3f/0x70 [ 1832.134125] ? cifs_strndup_to_utf16+0x15b/0x250 [cifs] [ 1832.134502] ? lock_downgrade+0x6f0/0x6f0 [ 1832.134760] ? cifs_convert_path_to_utf16+0x198/0x220 [cifs] [ 1832.135170] ? smb2_check_message+0x1080/0x1080 [cifs] [ 1832.135545] cifs_ioctl+0x1577/0x3320 [cifs] [ 1832.135864] ? lock_downgrade+0x6f0/0x6f0 [ 1832.136125] ? cifs_readdir+0x2e60/0x2e60 [cifs] [ 1832.136468] ? rcu_read_lock_sched_held+0x3f/0x70 [ 1832.136769] ? __rseq_handle_notify_resume+0x80b/0xbe0 [ 1832.137096] ? __up_read+0x192/0x710 [ 1832.137327] ? __ia32_sys_rseq+0xf0/0xf0 [ 1832.137578] ? __x64_sys_openat+0x11f/0x1d0 [ 1832.137850] __x64_sys_ioctl+0x127/0x190 [ 1832.138103] do_syscall_64+0x3b/0x90 [ 1832.138378] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 1832.138702] RIP: 0033:0x7fcee9a253df [ 1832.138937] Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05 <41> 89 c0 3d 00 f0 ff ff 77 1f 48 8b 44 24 18 64 48 2b 04 25 28 00 [ 1832.140107] RSP: 002b:00007ffeba94a8a0 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 1832.140606] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fcee9a253df [ 1832.141058] RDX: 00007ffeba94a910 RSI: 00000000c018cf07 RDI: 0000000000000003 [ 1832.141503] RBP: 00007ffeba94a930 R08: 00007fcee9b24db0 R09: 00007fcee9b45c4e [ 1832.141948] R10: 00007fcee9918d40 R11: 0000000000000246 R12: 00007ffeba94aa48 [ 1832.142396] R13: 0000000000401176 R14: 0000000000403df8 R15: 00007fcee9b78000 [ 1832.142851] [ 1832.142994] Modules linked in: cifs cifs_arc4 cifs_md4 bpf_preload [last unloaded: cifs] Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2ops.c | 124 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 65 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 5923029ea586..db3ead52ec7c 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1631,6 +1631,7 @@ smb2_ioctl_query_info(const unsigned int xid, unsigned int size[2]; void *data[2]; int create_options = is_dir ? CREATE_NOT_FILE : CREATE_NOT_DIR; + void (*free_req1_func)(struct smb_rqst *r); vars = kzalloc(sizeof(*vars), GFP_ATOMIC); if (vars == NULL) @@ -1640,17 +1641,18 @@ smb2_ioctl_query_info(const unsigned int xid, resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; - if (copy_from_user(&qi, arg, sizeof(struct smb_query_info))) - goto e_fault; - + if (copy_from_user(&qi, arg, sizeof(struct smb_query_info))) { + rc = -EFAULT; + goto free_vars; + } if (qi.output_buffer_length > 1024) { - kfree(vars); - return -EINVAL; + rc = -EINVAL; + goto free_vars; } if (!ses || !server) { - kfree(vars); - return -EIO; + rc = -EIO; + goto free_vars; } if (smb3_encryption_required(tcon)) @@ -1659,8 +1661,8 @@ smb2_ioctl_query_info(const unsigned int xid, if (qi.output_buffer_length) { buffer = memdup_user(arg + sizeof(struct smb_query_info), qi.output_buffer_length); if (IS_ERR(buffer)) { - kfree(vars); - return PTR_ERR(buffer); + rc = PTR_ERR(buffer); + goto free_vars; } } @@ -1699,48 +1701,45 @@ smb2_ioctl_query_info(const unsigned int xid, rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, path); if (rc) - goto iqinf_exit; + goto free_output_buffer; smb2_set_next_command(tcon, &rqst[0]); /* Query */ if (qi.flags & PASSTHRU_FSCTL) { /* Can eventually relax perm check since server enforces too */ - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) { rc = -EPERM; - else { - rqst[1].rq_iov = &vars->io_iov[0]; - rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; - - rc = SMB2_ioctl_init(tcon, server, - &rqst[1], - COMPOUND_FID, COMPOUND_FID, - qi.info_type, true, buffer, - qi.output_buffer_length, - CIFSMaxBufSize - - MAX_SMB2_CREATE_RESPONSE_SIZE - - MAX_SMB2_CLOSE_RESPONSE_SIZE); + goto free_open_req; } + rqst[1].rq_iov = &vars->io_iov[0]; + rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; + + rc = SMB2_ioctl_init(tcon, server, &rqst[1], COMPOUND_FID, COMPOUND_FID, + qi.info_type, true, buffer, qi.output_buffer_length, + CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE); + free_req1_func = SMB2_ioctl_free; } else if (qi.flags == PASSTHRU_SET_INFO) { /* Can eventually relax perm check since server enforces too */ - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) { rc = -EPERM; - else if (qi.output_buffer_length < 8) + goto free_open_req; + } + if (qi.output_buffer_length < 8) { rc = -EINVAL; - else { - rqst[1].rq_iov = &vars->si_iov[0]; - rqst[1].rq_nvec = 1; - - /* MS-FSCC 2.4.13 FileEndOfFileInformation */ - size[0] = 8; - data[0] = buffer; - - rc = SMB2_set_info_init(tcon, server, - &rqst[1], - COMPOUND_FID, COMPOUND_FID, - current->tgid, - FILE_END_OF_FILE_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); + goto free_open_req; } + rqst[1].rq_iov = &vars->si_iov[0]; + rqst[1].rq_nvec = 1; + + /* MS-FSCC 2.4.13 FileEndOfFileInformation */ + size[0] = 8; + data[0] = buffer; + + rc = SMB2_set_info_init(tcon, server, &rqst[1], COMPOUND_FID, COMPOUND_FID, + current->tgid, FILE_END_OF_FILE_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + free_req1_func = SMB2_set_info_free; } else if (qi.flags == PASSTHRU_QUERY_INFO) { rqst[1].rq_iov = &vars->qi_iov[0]; rqst[1].rq_nvec = 1; @@ -1751,6 +1750,7 @@ smb2_ioctl_query_info(const unsigned int xid, qi.info_type, qi.additional_information, qi.input_buffer_length, qi.output_buffer_length, buffer); + free_req1_func = SMB2_query_info_free; } else { /* unknown flags */ cifs_tcon_dbg(VFS, "Invalid passthru query flags: 0x%x\n", qi.flags); @@ -1758,7 +1758,7 @@ smb2_ioctl_query_info(const unsigned int xid, } if (rc) - goto iqinf_exit; + goto free_open_req; smb2_set_next_command(tcon, &rqst[1]); smb2_set_related(&rqst[1]); @@ -1769,14 +1769,14 @@ smb2_ioctl_query_info(const unsigned int xid, rc = SMB2_close_init(tcon, server, &rqst[2], COMPOUND_FID, COMPOUND_FID, false); if (rc) - goto iqinf_exit; + goto free_req_1; smb2_set_related(&rqst[2]); rc = compound_send_recv(xid, ses, server, flags, 3, rqst, resp_buftype, rsp_iov); if (rc) - goto iqinf_exit; + goto out; /* No need to bump num_remote_opens since handle immediately closed */ if (qi.flags & PASSTHRU_FSCTL) { @@ -1786,18 +1786,22 @@ smb2_ioctl_query_info(const unsigned int xid, qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount); if (qi.input_buffer_length > 0 && le32_to_cpu(io_rsp->OutputOffset) + qi.input_buffer_length - > rsp_iov[1].iov_len) - goto e_fault; + > rsp_iov[1].iov_len) { + rc = -EFAULT; + goto out; + } if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, - sizeof(qi.input_buffer_length))) - goto e_fault; + sizeof(qi.input_buffer_length))) { + rc = -EFAULT; + goto out; + } if (copy_to_user((void __user *)pqi + sizeof(struct smb_query_info), (const void *)io_rsp + le32_to_cpu(io_rsp->OutputOffset), qi.input_buffer_length)) - goto e_fault; + rc = -EFAULT; } else { pqi = (struct smb_query_info __user *)arg; qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; @@ -1805,28 +1809,30 @@ smb2_ioctl_query_info(const unsigned int xid, qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength); if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, - sizeof(qi.input_buffer_length))) - goto e_fault; + sizeof(qi.input_buffer_length))) { + rc = -EFAULT; + goto out; + } if (copy_to_user(pqi + 1, qi_rsp->Buffer, qi.input_buffer_length)) - goto e_fault; + rc = -EFAULT; } - iqinf_exit: - cifs_small_buf_release(rqst[0].rq_iov[0].iov_base); - cifs_small_buf_release(rqst[1].rq_iov[0].iov_base); - cifs_small_buf_release(rqst[2].rq_iov[0].iov_base); +out: free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); - kfree(vars); + SMB2_close_free(&rqst[2]); +free_req_1: + free_req1_func(&rqst[1]); +free_open_req: + SMB2_open_free(&rqst[0]); +free_output_buffer: kfree(buffer); +free_vars: + kfree(vars); return rc; - -e_fault: - rc = -EFAULT; - goto iqinf_exit; } static ssize_t -- cgit v1.2.3 From 7c5312fdb1dcfdc1951b018669af88d5d6420b31 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 1 Apr 2022 11:28:15 -0700 Subject: ocfs2: fix crash when mount with quota enabled commit de19433423c7bedabbd4f9a25f7dbc62c5e78921 upstream. There is a reported crash when mounting ocfs2 with quota enabled. RIP: 0010:ocfs2_qinfo_lock_res_init+0x44/0x50 [ocfs2] Call Trace: ocfs2_local_read_info+0xb9/0x6f0 [ocfs2] dquot_load_quota_sb+0x216/0x470 dquot_load_quota_inode+0x85/0x100 ocfs2_enable_quotas+0xa0/0x1c0 [ocfs2] ocfs2_fill_super.cold+0xc8/0x1bf [ocfs2] mount_bdev+0x185/0x1b0 legacy_get_tree+0x27/0x40 vfs_get_tree+0x25/0xb0 path_mount+0x465/0xac0 __x64_sys_mount+0x103/0x140 It is caused by when initializing dqi_gqlock, the corresponding dqi_type and dqi_sb are not properly initialized. This issue is introduced by commit 6c85c2c72819, which wants to avoid accessing uninitialized variables in error cases. So make global quota info properly initialized. Link: https://lkml.kernel.org/r/20220323023644.40084-1-joseph.qi@linux.alibaba.com Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1007141 Fixes: 6c85c2c72819 ("ocfs2: quota_local: fix possible uninitialized-variable access in ocfs2_local_read_info()") Signed-off-by: Joseph Qi Reported-by: Dayvison Tested-by: Valentin Vidic Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/quota_global.c | 23 ++++++++++++----------- fs/ocfs2/quota_local.c | 2 -- 2 files changed, 12 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index f033de733adb..effe92c7d693 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -337,7 +337,6 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) /* Read information header from global quota file */ int ocfs2_global_read_info(struct super_block *sb, int type) { - struct inode *gqinode = NULL; unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, GROUP_QUOTA_SYSTEM_INODE }; struct ocfs2_global_disk_dqinfo dinfo; @@ -346,29 +345,31 @@ int ocfs2_global_read_info(struct super_block *sb, int type) u64 pcount; int status; + oinfo->dqi_gi.dqi_sb = sb; + oinfo->dqi_gi.dqi_type = type; + ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); + oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk); + oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops; + oinfo->dqi_gqi_bh = NULL; + oinfo->dqi_gqi_count = 0; + /* Read global header */ - gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], + oinfo->dqi_gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], OCFS2_INVALID_SLOT); - if (!gqinode) { + if (!oinfo->dqi_gqinode) { mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n", type); status = -EINVAL; goto out_err; } - oinfo->dqi_gi.dqi_sb = sb; - oinfo->dqi_gi.dqi_type = type; - oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk); - oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops; - oinfo->dqi_gqi_bh = NULL; - oinfo->dqi_gqi_count = 0; - oinfo->dqi_gqinode = gqinode; + status = ocfs2_lock_global_qf(oinfo, 0); if (status < 0) { mlog_errno(status); goto out_err; } - status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk, + status = ocfs2_extent_map_get_blocks(oinfo->dqi_gqinode, 0, &oinfo->dqi_giblk, &pcount, NULL); if (status < 0) goto out_unlock; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 0e4b16d4c037..b1a8b046f4c2 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -702,8 +702,6 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) info->dqi_priv = oinfo; oinfo->dqi_type = type; INIT_LIST_HEAD(&oinfo->dqi_chunk); - oinfo->dqi_gqinode = NULL; - ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); oinfo->dqi_rec = NULL; oinfo->dqi_lqi_bh = NULL; oinfo->dqi_libh = NULL; -- cgit v1.2.3 From 6cdb84dd0c8dd030f37a710f21bf69abc7229001 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 26 Jan 2022 03:57:39 +0100 Subject: coredump: Also dump first pages of non-executable ELF libraries commit 84158b7f6a0624b81800b4e7c90f7fb7fdecf66c upstream. When I rewrote the VMA dumping logic for coredumps, I changed it to recognize ELF library mappings based on the file being executable instead of the mapping having an ELF header. But turns out, distros ship many ELF libraries as non-executable, so the heuristic goes wrong... Restore the old behavior where FILTER(ELF_HEADERS) dumps the first page of any offset-0 readable mapping that starts with the ELF magic. This fix is technically layer-breaking a bit, because it checks for something ELF-specific in fs/coredump.c; but since we probably want to share this between standard ELF and FDPIC ELF anyway, I guess it's fine? And this also keeps the change small for backporting. Cc: stable@vger.kernel.org Fixes: 429a22e776a2 ("coredump: rework elf/elf_fdpic vma_dump_size() into common helper") Reported-by: Bill Messmer Signed-off-by: Jann Horn Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220126025739.2014888-1-jannh@google.com Signed-off-by: Greg Kroah-Hartman --- fs/coredump.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 3224dee44d30..bb66a4e3819f 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -992,6 +993,8 @@ static bool always_dump_vma(struct vm_area_struct *vma) return false; } +#define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1 + /* * Decide how much of @vma's contents should be included in a core dump. */ @@ -1051,9 +1054,20 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, * dump the first page to aid in determining what was mapped here. */ if (FILTER(ELF_HEADERS) && - vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) && - (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) - return PAGE_SIZE; + vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { + if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) + return PAGE_SIZE; + + /* + * ELF libraries aren't always executable. + * We'll want to check whether the mapping starts with the ELF + * magic, but not now - we're holding the mmap lock, + * so copy_from_user() doesn't work here. + * Use a placeholder instead, and fix it up later in + * dump_vma_snapshot(). + */ + return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER; + } #undef FILTER @@ -1128,8 +1142,6 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, m->end = vma->vm_end; m->flags = vma->vm_flags; m->dump_size = vma_dump_size(vma, cprm->mm_flags); - - vma_data_size += m->dump_size; } mmap_write_unlock(mm); @@ -1139,6 +1151,23 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, return -EFAULT; } + for (i = 0; i < *vma_count; i++) { + struct core_vma_metadata *m = (*vma_meta) + i; + + if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) { + char elfmag[SELFMAG]; + + if (copy_from_user(elfmag, (void __user *)m->start, SELFMAG) || + memcmp(elfmag, ELFMAG, SELFMAG) != 0) { + m->dump_size = 0; + } else { + m->dump_size = PAGE_SIZE; + } + } + + vma_data_size += m->dump_size; + } + *vma_data_size_ptr = vma_data_size; return 0; } -- cgit v1.2.3 From 597393cde8411ebd2098c819a29c2de23f66af65 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 28 Feb 2022 10:48:15 +0800 Subject: ext4: fix fs corruption when tring to remove a non-empty directory with IO error commit 7aab5c84a0f6ec2290e2ba4a6b245178b1bf949a upstream. We inject IO error when rmdir non empty direcory, then got issue as follows: step1: mkfs.ext4 -F /dev/sda step2: mount /dev/sda test step3: cd test step4: mkdir -p 1/2 step5: rmdir 1 [ 110.920551] ext4_empty_dir: inject fault [ 110.921926] EXT4-fs warning (device sda): ext4_rmdir:3113: inode #12: comm rmdir: empty directory '1' has too many links (3) step6: cd .. step7: umount test step8: fsck.ext4 -f /dev/sda e2fsck 1.42.9 (28-Dec-2013) Pass 1: Checking inodes, blocks, and sizes Pass 2: Checking directory structure Entry '..' in .../??? (13) has deleted/unused inode 12. Clear? yes Pass 3: Checking directory connectivity Unconnected directory inode 13 (...) Connect to /lost+found? yes Pass 4: Checking reference counts Inode 13 ref count is 3, should be 2. Fix? yes Pass 5: Checking group summary information /dev/sda: ***** FILE SYSTEM WAS MODIFIED ***** /dev/sda: 12/131072 files (0.0% non-contiguous), 26157/524288 blocks ext4_rmdir if (!ext4_empty_dir(inode)) goto end_rmdir; ext4_empty_dir bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE); if (IS_ERR(bh)) return true; Now if read directory block failed, 'ext4_empty_dir' will return true, assume directory is empty. Obviously, it will lead to above issue. To solve this issue, if read directory block failed 'ext4_empty_dir' just return false. To avoid making things worse when file system is already corrupted, 'ext4_empty_dir' also return false. Signed-off-by: Ye Bin Cc: stable@kernel.org Link: https://lore.kernel.org/r/20220228024815.3952506-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inline.c | 9 ++++----- fs/ext4/namei.c | 10 +++++----- 2 files changed, 9 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d091133a4b46..46fdb40c3962 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1788,19 +1788,20 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) void *inline_pos; unsigned int offset; struct ext4_dir_entry_2 *de; - bool ret = true; + bool ret = false; err = ext4_get_inode_loc(dir, &iloc); if (err) { EXT4_ERROR_INODE_ERR(dir, -err, "error %d getting inode %lu block", err, dir->i_ino); - return true; + return false; } down_read(&EXT4_I(dir)->xattr_sem); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; + ret = true; goto out; } @@ -1809,7 +1810,6 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - no `..'", dir->i_ino); - ret = true; goto out; } @@ -1828,16 +1828,15 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) dir->i_ino, le32_to_cpu(de->inode), le16_to_cpu(de->rec_len), de->name_len, inline_size); - ret = true; goto out; } if (le32_to_cpu(de->inode)) { - ret = false; goto out; } offset += ext4_rec_len_from_disk(de->rec_len, inline_size); } + ret = true; out: up_read(&EXT4_I(dir)->xattr_sem); brelse(iloc.bh); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 192ea90e757c..8cb5ea7ee506 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2997,14 +2997,14 @@ bool ext4_empty_dir(struct inode *inode) if (inode->i_size < ext4_dir_rec_len(1, NULL) + ext4_dir_rec_len(2, NULL)) { EXT4_ERROR_INODE(inode, "invalid size"); - return true; + return false; } /* The first directory block must not be a hole, * so treat it as DIRENT_HTREE */ bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE); if (IS_ERR(bh)) - return true; + return false; de = (struct ext4_dir_entry_2 *) bh->b_data; if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, @@ -3012,7 +3012,7 @@ bool ext4_empty_dir(struct inode *inode) le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) { ext4_warning_inode(inode, "directory missing '.'"); brelse(bh); - return true; + return false; } offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); de = ext4_next_entry(de, sb->s_blocksize); @@ -3021,7 +3021,7 @@ bool ext4_empty_dir(struct inode *inode) le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { ext4_warning_inode(inode, "directory missing '..'"); brelse(bh); - return true; + return false; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); while (offset < inode->i_size) { @@ -3035,7 +3035,7 @@ bool ext4_empty_dir(struct inode *inode) continue; } if (IS_ERR(bh)) - return true; + return false; } de = (struct ext4_dir_entry_2 *) (bh->b_data + (offset & (sb->s_blocksize - 1))); -- cgit v1.2.3 From 3c65b7309d2e0dd8d134e7813cd95debd91ea07b Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 8 Mar 2022 15:22:01 +0530 Subject: ext4: make mb_optimize_scan performance mount option work with extents commit 077d0c2c78df6f7260cdd015a991327efa44d8ad upstream. Currently mb_optimize_scan scan feature which improves filesystem performance heavily (when FS is fragmented), seems to be not working with files with extents (ext4 by default has files with extents). This patch fixes that and makes mb_optimize_scan feature work for files with extents. Below are some performance numbers obtained when allocating a 10M and 100M file with and w/o this patch on a filesytem with no 1M contiguous block. =============== Workload: dd if=/dev/urandom of=test conv=fsync bs=1M count=10/100 Time taken ===================================================== no. Size without-patch with-patch Diff(%) 1 10M 0m8.401s 0m5.623s 33.06% 2 100M 1m40.465s 1m14.737s 25.6% ============= w/o patch: mballoc: reqs: 17056 success: 11407 groups_scanned: 13643 cr0_stats: hits: 37 groups_considered: 9472 useless_loops: 36 bad_suggestions: 0 cr1_stats: hits: 11418 groups_considered: 908560 useless_loops: 1894 bad_suggestions: 0 cr2_stats: hits: 1873 groups_considered: 6913 useless_loops: 21 cr3_stats: hits: 21 groups_considered: 5040 useless_loops: 21 extents_scanned: 417364 goal_hits: 3707 2^n_hits: 37 breaks: 1873 lost: 0 buddies_generated: 239/240 buddies_time_used: 651080 preallocated: 705 discarded: 478 with patch: mballoc: reqs: 12768 success: 11305 groups_scanned: 12768 cr0_stats: hits: 1 groups_considered: 18 useless_loops: 0 bad_suggestions: 0 cr1_stats: hits: 5829 groups_considered: 50626 useless_loops: 0 bad_suggestions: 0 cr2_stats: hits: 6938 groups_considered: 580363 useless_loops: 0 cr3_stats: hits: 0 groups_considered: 0 useless_loops: 0 extents_scanned: 309059 goal_hits: 0 2^n_hits: 1 breaks: 1463 lost: 0 buddies_generated: 239/240 buddies_time_used: 791392 preallocated: 673 discarded: 446 Fixes: 196e402 (ext4: improve cr 0 / cr 1 group scanning) Cc: stable@kernel.org Reported-by: Geetika Moolchandani Reported-by: Nageswara R Sastry Suggested-by: Ritesh Harjani Signed-off-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/fc9a48f7f8dcfc83891a8b21f6dd8cdf056ed810.1646732698.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 74e3286d0e26..d2c906d3f63b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1000,7 +1000,7 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) return 0; if (ac->ac_criteria >= 2) return 0; - if (ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) + if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) return 0; return 1; } -- cgit v1.2.3 From bc5f440e1c5c86a09b4045049552fd31b5aea293 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 14 Mar 2022 19:59:53 +0100 Subject: pstore: Don't use semaphores in always-atomic-context code commit 8126b1c73108bc691f5643df19071a59a69d0bc6 upstream. pstore_dump() is *always* invoked in atomic context (nowadays in an RCU read-side critical section, before that under a spinlock). It doesn't make sense to try to use semaphores here. This is mostly a revert of commit ea84b580b955 ("pstore: Convert buf_lock to semaphore"), except that two parts aren't restored back exactly as they were: - keep the lock initialization in pstore_register - in efi_pstore_write(), always set the "block" flag to false - omit "is_locked", that was unnecessary since commit 959217c84c27 ("pstore: Actually give up during locking failure") - fix the bailout message The actual problem that the buggy commit was trying to address may have been that the use of preemptible() in efi_pstore_write() was wrong - it only looks at preempt_count() and the state of IRQs, but __rcu_read_lock() doesn't touch either of those under CONFIG_PREEMPT_RCU. (Sidenote: CONFIG_PREEMPT_RCU means that the scheduler can preempt tasks in RCU read-side critical sections, but you're not allowed to actively block/reschedule.) Lockdep probably never caught the problem because it's very rare that you actually hit the contended case, so lockdep always just sees the down_trylock(), not the down_interruptible(), and so it can't tell that there's a problem. Fixes: ea84b580b955 ("pstore: Convert buf_lock to semaphore") Cc: stable@vger.kernel.org Acked-by: Sebastian Andrzej Siewior Signed-off-by: Jann Horn Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220314185953.2068993-1-jannh@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/efi-pstore.c | 2 +- fs/pstore/platform.c | 38 ++++++++++++++++++-------------------- include/linux/pstore.h | 6 +++--- 3 files changed, 22 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c index 0ef086e43090..7e771c56c13c 100644 --- a/drivers/firmware/efi/efi-pstore.c +++ b/drivers/firmware/efi/efi-pstore.c @@ -266,7 +266,7 @@ static int efi_pstore_write(struct pstore_record *record) efi_name[i] = name[i]; ret = efivar_entry_set_safe(efi_name, vendor, PSTORE_EFI_ATTRIBUTES, - preemptible(), record->size, record->psi->buf); + false, record->size, record->psi->buf); if (record->reason == KMSG_DUMP_OOPS && try_module_get(THIS_MODULE)) if (!schedule_work(&efivar_work)) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index b9614db48b1d..ad96ba97d8f9 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -143,21 +143,22 @@ static void pstore_timer_kick(void) mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms)); } -/* - * Should pstore_dump() wait for a concurrent pstore_dump()? If - * not, the current pstore_dump() will report a failure to dump - * and return. - */ -static bool pstore_cannot_wait(enum kmsg_dump_reason reason) +static bool pstore_cannot_block_path(enum kmsg_dump_reason reason) { - /* In NMI path, pstore shouldn't block regardless of reason. */ + /* + * In case of NMI path, pstore shouldn't be blocked + * regardless of reason. + */ if (in_nmi()) return true; switch (reason) { /* In panic case, other cpus are stopped by smp_send_stop(). */ case KMSG_DUMP_PANIC: - /* Emergency restart shouldn't be blocked. */ + /* + * Emergency restart shouldn't be blocked by spinning on + * pstore_info::buf_lock. + */ case KMSG_DUMP_EMERG: return true; default: @@ -389,21 +390,19 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned long total = 0; const char *why; unsigned int part = 1; + unsigned long flags = 0; int ret; why = kmsg_dump_reason_str(reason); - if (down_trylock(&psinfo->buf_lock)) { - /* Failed to acquire lock: give up if we cannot wait. */ - if (pstore_cannot_wait(reason)) { - pr_err("dump skipped in %s path: may corrupt error record\n", - in_nmi() ? "NMI" : why); - return; - } - if (down_interruptible(&psinfo->buf_lock)) { - pr_err("could not grab semaphore?!\n"); + if (pstore_cannot_block_path(reason)) { + if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) { + pr_err("dump skipped in %s path because of concurrent dump\n", + in_nmi() ? "NMI" : why); return; } + } else { + spin_lock_irqsave(&psinfo->buf_lock, flags); } kmsg_dump_rewind(&iter); @@ -467,8 +466,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, total += record.size; part++; } - - up(&psinfo->buf_lock); + spin_unlock_irqrestore(&psinfo->buf_lock, flags); } static struct kmsg_dumper pstore_dumper = { @@ -594,7 +592,7 @@ int pstore_register(struct pstore_info *psi) psi->write_user = pstore_write_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); - sema_init(&psinfo->buf_lock, 1); + spin_lock_init(&psinfo->buf_lock); if (psi->flags & PSTORE_FLAGS_DMESG) allocate_buf_for_compression(); diff --git a/include/linux/pstore.h b/include/linux/pstore.h index eb93a54cff31..e97a8188f0fd 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include @@ -87,7 +87,7 @@ struct pstore_record { * @owner: module which is responsible for this backend driver * @name: name of the backend driver * - * @buf_lock: semaphore to serialize access to @buf + * @buf_lock: spinlock to serialize access to @buf * @buf: preallocated crash dump buffer * @bufsize: size of @buf available for crash dump bytes (must match * smallest number of bytes available for writing to a @@ -178,7 +178,7 @@ struct pstore_info { struct module *owner; const char *name; - struct semaphore buf_lock; + spinlock_t buf_lock; char *buf; size_t bufsize; -- cgit v1.2.3 From 1290eb4412aa0f0e9f3434b406dc8e255da85f9e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 31 Jan 2022 16:09:47 -0800 Subject: exec: Force single empty string when argv is empty commit dcd46d897adb70d63e025f175a00a89797d31a43 upstream. Quoting[1] Ariadne Conill: "In several other operating systems, it is a hard requirement that the second argument to execve(2) be the name of a program, thus prohibiting a scenario where argc < 1. POSIX 2017 also recommends this behaviour, but it is not an explicit requirement[2]: The argument arg0 should point to a filename string that is associated with the process being started by one of the exec functions. ... Interestingly, Michael Kerrisk opened an issue about this in 2008[3], but there was no consensus to support fixing this issue then. Hopefully now that CVE-2021-4034 shows practical exploitative use[4] of this bug in a shellcode, we can reconsider. This issue is being tracked in the KSPP issue tracker[5]." While the initial code searches[6][7] turned up what appeared to be mostly corner case tests, trying to that just reject argv == NULL (or an immediately terminated pointer list) quickly started tripping[8] existing userspace programs. The next best approach is forcing a single empty string into argv and adjusting argc to match. The number of programs depending on argc == 0 seems a smaller set than those calling execve with a NULL argv. Account for the additional stack space in bprm_stack_limits(). Inject an empty string when argc == 0 (and set argc = 1). Warn about the case so userspace has some notice about the change: process './argc0' launched './argc0' with NULL argv: empty string added Additionally WARN() and reject NULL argv usage for kernel threads. [1] https://lore.kernel.org/lkml/20220127000724.15106-1-ariadne@dereferenced.org/ [2] https://pubs.opengroup.org/onlinepubs/9699919799/functions/exec.html [3] https://bugzilla.kernel.org/show_bug.cgi?id=8408 [4] https://www.qualys.com/2022/01/25/cve-2021-4034/pwnkit.txt [5] https://github.com/KSPP/linux/issues/176 [6] https://codesearch.debian.net/search?q=execve%5C+*%5C%28%5B%5E%2C%5D%2B%2C+*NULL&literal=0 [7] https://codesearch.debian.net/search?q=execlp%3F%5Cs*%5C%28%5B%5E%2C%5D%2B%2C%5Cs*NULL&literal=0 [8] https://lore.kernel.org/lkml/20220131144352.GE16385@xsang-OptiPlex-9020/ Reported-by: Ariadne Conill Reported-by: Michael Kerrisk Cc: Matthew Wilcox Cc: Christian Brauner Cc: Rich Felker Cc: Eric Biederman Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Acked-by: Christian Brauner Acked-by: Ariadne Conill Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20220201000947.2453721-1-keescook@chromium.org Signed-off-by: Greg Kroah-Hartman --- fs/exec.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index ac7b51b51f38..29e865c59854 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -494,8 +494,14 @@ static int bprm_stack_limits(struct linux_binprm *bprm) * the stack. They aren't stored until much later when we can't * signal to the parent that the child has run out of stack space. * Instead, calculate it here so it's possible to fail gracefully. + * + * In the case of argc = 0, make sure there is space for adding a + * empty string (which will bump argc to 1), to ensure confused + * userspace programs don't start processing from argv[1], thinking + * argc can never be 0, to keep them from walking envp by accident. + * See do_execveat_common(). */ - ptr_size = (bprm->argc + bprm->envc) * sizeof(void *); + ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *); if (limit <= ptr_size) return -E2BIG; limit -= ptr_size; @@ -1895,6 +1901,9 @@ static int do_execveat_common(int fd, struct filename *filename, } retval = count(argv, MAX_ARG_STRINGS); + if (retval == 0) + pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", + current->comm, bprm->filename); if (retval < 0) goto out_free; bprm->argc = retval; @@ -1921,6 +1930,19 @@ static int do_execveat_common(int fd, struct filename *filename, if (retval < 0) goto out_free; + /* + * When argv is empty, add an empty string ("") as argv[0] to + * ensure confused userspace programs that start processing + * from argv[1] won't end up walking envp. See also + * bprm_stack_limits(). + */ + if (bprm->argc == 0) { + retval = copy_string_kernel("", bprm); + if (retval < 0) + goto out_free; + bprm->argc = 1; + } + retval = bprm_execve(bprm, fd, filename, flags); out_free: free_bprm(bprm); @@ -1949,6 +1971,8 @@ int kernel_execve(const char *kernel_filename, } retval = count_strings_kernel(argv); + if (WARN_ON_ONCE(retval == 0)) + retval = -EINVAL; if (retval < 0) goto out_free; bprm->argc = retval; -- cgit v1.2.3 From 68a8120e1647b6c9cf82687d5f7c6a96ee3b8e97 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 18 Feb 2022 13:14:19 +0900 Subject: btrfs: zoned: mark relocation as writing commit ca5e4ea0beaec8bc674121838bf8614c089effb9 upstream. There is a hung_task issue with running generic/068 on an SMR device. The hang occurs while a process is trying to thaw the filesystem. The process is trying to take sb->s_umount to thaw the FS. The lock is held by fsstress, which calls btrfs_sync_fs() and is waiting for an ordered extent to finish. However, as the FS is frozen, the ordered extents never finish. Having an ordered extent while the FS is frozen is the root cause of the hang. The ordered extent is initiated from btrfs_relocate_chunk() which is called from btrfs_reclaim_bgs_work(). This commit adds sb_*_write() around btrfs_relocate_chunk() call site. For the usual "btrfs balance" command, we already call it with mnt_want_file() in btrfs_ioctl_balance(). Fixes: 18bb8bbf13c1 ("btrfs: zoned: automatically reclaim zones") CC: stable@vger.kernel.org # 5.13+ Link: https://github.com/naota/linux/issues/56 Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/block-group.c | 8 +++++++- fs/btrfs/volumes.c | 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e1c5c2114edf..53ff3db9640e 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1504,8 +1504,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + sb_start_write(fs_info->sb); + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + sb_end_write(fs_info->sb); return; + } /* * Long running balances can keep us blocked here for eternity, so @@ -1513,6 +1517,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) */ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) { btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); return; } @@ -1581,6 +1586,7 @@ next: spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); } void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 06a1a7c2254c..57021cc83811 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -8185,10 +8185,12 @@ static int relocating_repair_kthread(void *data) target = cache->start; btrfs_put_block_group(cache); + sb_start_write(fs_info->sb); if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { btrfs_info(fs_info, "zoned: skip relocating block group %llu to repair: EBUSY", target); + sb_end_write(fs_info->sb); return -EBUSY; } @@ -8216,6 +8218,7 @@ out: btrfs_put_block_group(cache); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); return ret; } -- cgit v1.2.3 From f85ee0c845fd42b63bc10dd3e37e68115ad3a710 Mon Sep 17 00:00:00 2001 From: Niels Dossche Date: Fri, 25 Feb 2022 22:20:28 +0100 Subject: btrfs: extend locking to all space_info members accesses commit 06bae876634ebf837ba70ea3de532b288326103d upstream. bytes_pinned is always accessed under space_info->lock, except in btrfs_preempt_reclaim_metadata_space, however the other members are accessed under that lock. The reserved member of the rsv's are also partially accessed under a lock and partially not. Move all these accesses into the same lock to ensure consistency. This could potentially race and lead to a flush instead of a commit but it's not a big problem as it's only for preemptive flush. CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Niels Dossche Signed-off-by: Niels Dossche Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/space-info.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index aa5be0b24987..5ed66a794e57 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1054,7 +1054,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) trans_rsv->reserved; if (block_rsv_size < space_info->bytes_may_use) delalloc_size = space_info->bytes_may_use - block_rsv_size; - spin_unlock(&space_info->lock); /* * We don't want to include the global_rsv in our calculation, @@ -1085,6 +1084,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) flush = FLUSH_DELAYED_REFS_NR; } + spin_unlock(&space_info->lock); + /* * We don't want to reclaim everything, just a portion, so scale * down the to_reclaim by 1/4. If it takes us down to 0, -- cgit v1.2.3 From 00c6bb4cea62597bd80cdda14eae6d1998018b99 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Mar 2022 09:10:21 +0800 Subject: btrfs: verify the tranisd of the to-be-written dirty extent buffer commit 3777369ff1518b579560611a0d0c33f930154f64 upstream. [BUG] There is a bug report that a bitflip in the transid part of an extent buffer makes btrfs to reject certain tree blocks: BTRFS error (device dm-0): parent transid verify failed on 1382301696 wanted 262166 found 22 [CAUSE] Note the failed transid check, hex(262166) = 0x40016, while hex(22) = 0x16. It's an obvious bitflip. Furthermore, the reporter also confirmed the bitflip is from the hardware, so it's a real hardware caused bitflip, and such problem can not be detected by the existing tree-checker framework. As tree-checker can only verify the content inside one tree block, while generation of a tree block can only be verified against its parent. So such problem remain undetected. [FIX] Although tree-checker can not verify it at write-time, we still have a quick (but not the most accurate) way to catch such obvious corruption. Function csum_one_extent_buffer() is called before we submit metadata write. Thus it means, all the extent buffer passed in should be dirty tree blocks, and should be newer than last committed transaction. Using that we can catch the above bitflip. Although it's not a perfect solution, as if the corrupted generation is higher than the correct value, we have no way to catch it at all. Reported-by: Christoph Anton Mitterer Link: https://lore.kernel.org/linux-btrfs/2dfcbc130c55cc6fd067b93752e90bd2b079baca.camel@scientia.org/ CC: stable@vger.kernel.org # 5.15+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/disk-io.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d5a590b11be5..aed1f26ca2b8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -441,17 +441,31 @@ static int csum_one_extent_buffer(struct extent_buffer *eb) else ret = btrfs_check_leaf_full(eb); - if (ret < 0) { - btrfs_print_tree(eb, 0); + if (ret < 0) + goto error; + + /* + * Also check the generation, the eb reached here must be newer than + * last committed. Or something seriously wrong happened. + */ + if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) { + ret = -EUCLEAN; btrfs_err(fs_info, - "block=%llu write time tree block corruption detected", - eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); - return ret; + "block=%llu bad generation, have %llu expect > %llu", + eb->start, btrfs_header_generation(eb), + fs_info->last_trans_committed); + goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); return 0; + +error: + btrfs_print_tree(eb, 0); + btrfs_err(fs_info, "block=%llu write time tree block corruption detected", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return ret; } /* Checksum all dirty extent buffers in one bio_vec */ -- cgit v1.2.3 From 1323976e9448ace3af334235e4e0c1858119e782 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Mar 2022 17:26:19 -0600 Subject: io_uring: don't check unrelated req->open.how in accept request [ Upstream commit adf3a9e9f556613197583a1884f0de40a8bb6fb9 ] Looks like a victim of too much copy/paste, we should not be looking at req->open.how in accept. The point is to check CLOEXEC and error out, which we don't invalid direct descriptors on exec. Hence any attempt to get a direct descriptor with CLOEXEC is invalid. No harm is done here, as req->open.how.flags overlaps with req->accept.flags, but it's very confusing and might change if either of those command structs are modified. Fixes: aaa4db12ef7b ("io_uring: accept directly into fixed file table") Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 156c54ebb62b..70e85f64dc38 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5154,8 +5154,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->nofile = rlimit(RLIMIT_NOFILE); accept->file_slot = READ_ONCE(sqe->file_index); - if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) || - (accept->flags & SOCK_CLOEXEC))) + if (accept->file_slot && (accept->flags & SOCK_CLOEXEC)) return -EINVAL; if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; -- cgit v1.2.3 From 109dda45102112799e507a65e375ebb096d3ae49 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 18 Mar 2022 11:28:13 -0600 Subject: io_uring: terminate manual loop iterator loop correctly for non-vecs [ Upstream commit 5e929367468c8f97cd1ffb0417316cecfebef94b ] The fix for not advancing the iterator if we're using fixed buffers is broken in that it can hit a condition where we don't terminate the loop. This results in io-wq looping forever, asking to read (or write) 0 bytes for every subsequent loop. Reported-by: Joel Jaeschke Link: https://github.com/axboe/liburing/issues/549 Fixes: 16c8d2df7ec0 ("io_uring: ensure symmetry in handling iter types in loop_rw_iter()") Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/io_uring.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 70e85f64dc38..ec0b50940405 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3319,13 +3319,15 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) ret = nr; break; } + ret += nr; if (!iov_iter_is_bvec(iter)) { iov_iter_advance(iter, nr); } else { - req->rw.len -= nr; req->rw.addr += nr; + req->rw.len -= nr; + if (!req->rw.len) + break; } - ret += nr; if (nr != iovec.iov_len) break; } -- cgit v1.2.3 From d8c8dd97bb8cd5d1ab2857850e6166e51417435e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 18 Jan 2022 11:48:02 +0800 Subject: f2fs: fix to enable ATGC correctly via gc_idle sysfs interface [ Upstream commit 7d19e3dab0002e527052b0aaf986e8c32e5537bf ] It needs to assign sbi->gc_mode with GC_IDLE_AT rather than GC_AT when user tries to enable ATGC via gc_idle sysfs interface, fix it. Fixes: 093749e296e2 ("f2fs: support age threshold based garbage collection") Cc: Zhipeng Tan Signed-off-by: Jicheng Shao Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index abc4344fba39..8b36e61fe7ed 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -473,7 +473,7 @@ out: } else if (t == GC_IDLE_AT) { if (!sbi->am.atgc_enabled) return -EINVAL; - sbi->gc_mode = GC_AT; + sbi->gc_mode = GC_IDLE_AT; } else { sbi->gc_mode = GC_NORMAL; } -- cgit v1.2.3 From d1eaaf6cadedf638e17017b0a04d56308405ac10 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Feb 2022 18:56:46 -0800 Subject: f2fs: fix missing free nid in f2fs_handle_failed_inode [ Upstream commit 2fef99b8372c1ae3d8445ab570e888b5a358dbe9 ] This patch fixes xfstests/generic/475 failure. [ 293.680694] F2FS-fs (dm-1): May loss orphan inode, run fsck to fix. [ 293.685358] Buffer I/O error on dev dm-1, logical block 8388592, async page read [ 293.691527] Buffer I/O error on dev dm-1, logical block 8388592, async page read [ 293.691764] sh (7615): drop_caches: 3 [ 293.691819] sh (7616): drop_caches: 3 [ 293.694017] Buffer I/O error on dev dm-1, logical block 1, async page read [ 293.695659] sh (7618): drop_caches: 3 [ 293.696979] sh (7617): drop_caches: 3 [ 293.700290] sh (7623): drop_caches: 3 [ 293.708621] sh (7626): drop_caches: 3 [ 293.711386] sh (7628): drop_caches: 3 [ 293.711825] sh (7627): drop_caches: 3 [ 293.716738] sh (7630): drop_caches: 3 [ 293.719613] sh (7632): drop_caches: 3 [ 293.720971] sh (7633): drop_caches: 3 [ 293.727741] sh (7634): drop_caches: 3 [ 293.730783] sh (7636): drop_caches: 3 [ 293.732681] sh (7635): drop_caches: 3 [ 293.732988] sh (7637): drop_caches: 3 [ 293.738836] sh (7639): drop_caches: 3 [ 293.740568] sh (7641): drop_caches: 3 [ 293.743053] sh (7640): drop_caches: 3 [ 293.821889] ------------[ cut here ]------------ [ 293.824654] kernel BUG at fs/f2fs/node.c:3334! [ 293.826226] invalid opcode: 0000 [#1] PREEMPT SMP PTI [ 293.828713] CPU: 0 PID: 7653 Comm: umount Tainted: G OE 5.17.0-rc1-custom #1 [ 293.830946] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 [ 293.832526] RIP: 0010:f2fs_destroy_node_manager+0x33f/0x350 [f2fs] [ 293.833905] Code: e8 d6 3d f9 f9 48 8b 45 d0 65 48 2b 04 25 28 00 00 00 75 1a 48 81 c4 28 03 00 00 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 0b [ 293.837783] RSP: 0018:ffffb04ec31e7a20 EFLAGS: 00010202 [ 293.839062] RAX: 0000000000000001 RBX: ffff9df947db2eb8 RCX: 0000000080aa0072 [ 293.840666] RDX: 0000000000000000 RSI: ffffe86c0432a140 RDI: ffffffffc0b72a21 [ 293.842261] RBP: ffffb04ec31e7d70 R08: ffff9df94ca85780 R09: 0000000080aa0072 [ 293.843909] R10: ffff9df94ca85700 R11: ffff9df94e1ccf58 R12: ffff9df947db2e00 [ 293.845594] R13: ffff9df947db2ed0 R14: ffff9df947db2eb8 R15: ffff9df947db2eb8 [ 293.847855] FS: 00007f5a97379800(0000) GS:ffff9dfa77c00000(0000) knlGS:0000000000000000 [ 293.850647] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 293.852940] CR2: 00007f5a97528730 CR3: 000000010bc76005 CR4: 0000000000370ef0 [ 293.854680] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 293.856423] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 293.858380] Call Trace: [ 293.859302] [ 293.860311] ? ttwu_do_wakeup+0x1c/0x170 [ 293.861800] ? ttwu_do_activate+0x6d/0xb0 [ 293.863057] ? _raw_spin_unlock_irqrestore+0x29/0x40 [ 293.864411] ? try_to_wake_up+0x9d/0x5e0 [ 293.865618] ? debug_smp_processor_id+0x17/0x20 [ 293.866934] ? debug_smp_processor_id+0x17/0x20 [ 293.868223] ? free_unref_page+0xbf/0x120 [ 293.869470] ? __free_slab+0xcb/0x1c0 [ 293.870614] ? preempt_count_add+0x7a/0xc0 [ 293.871811] ? __slab_free+0xa0/0x2d0 [ 293.872918] ? __wake_up_common_lock+0x8a/0xc0 [ 293.874186] ? __slab_free+0xa0/0x2d0 [ 293.875305] ? free_inode_nonrcu+0x20/0x20 [ 293.876466] ? free_inode_nonrcu+0x20/0x20 [ 293.877650] ? debug_smp_processor_id+0x17/0x20 [ 293.878949] ? call_rcu+0x11a/0x240 [ 293.880060] ? f2fs_destroy_stats+0x59/0x60 [f2fs] [ 293.881437] ? kfree+0x1fe/0x230 [ 293.882674] f2fs_put_super+0x160/0x390 [f2fs] [ 293.883978] generic_shutdown_super+0x7a/0x120 [ 293.885274] kill_block_super+0x27/0x50 [ 293.886496] kill_f2fs_super+0x7f/0x100 [f2fs] [ 293.887806] deactivate_locked_super+0x35/0xa0 [ 293.889271] deactivate_super+0x40/0x50 [ 293.890513] cleanup_mnt+0x139/0x190 [ 293.891689] __cleanup_mnt+0x12/0x20 [ 293.892850] task_work_run+0x64/0xa0 [ 293.894035] exit_to_user_mode_prepare+0x1b7/0x1c0 [ 293.895409] syscall_exit_to_user_mode+0x27/0x50 [ 293.896872] do_syscall_64+0x48/0xc0 [ 293.898090] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 293.899517] RIP: 0033:0x7f5a975cd25b Fixes: 7735730d39d7 ("f2fs: fix to propagate error from __get_meta_page()") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 4557de37a911..6488f5ff250c 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -876,6 +876,7 @@ void f2fs_handle_failed_inode(struct inode *inode) err = f2fs_get_node_info(sbi, inode->i_ino, &ni); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); + set_inode_flag(inode, FI_FREE_NID); f2fs_warn(sbi, "May loss orphan inode, run fsck to fix."); goto out; } -- cgit v1.2.3 From cc91880f041705062aab4cc1d4e1271f4ecabd08 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 24 Feb 2022 18:17:05 +0200 Subject: nfsd: more robust allocation failure handling in nfsd_file_cache_init [ Upstream commit 4d2eeafecd6c83b4444db3dc0ada201c89b1aa44 ] The nfsd file cache table can be pretty large and its allocation may require as many as 80 contigious pages. Employ the same fix that was employed for similar issue that was reported for the reply cache hash table allocation several years ago by commit 8f97514b423a ("nfsd: more robust allocation failure handling in nfsd_reply_cache_init"). Fixes: 65294c1f2c5e ("nfsd: add a new struct file caching facility to nfsd") Link: https://lore.kernel.org/linux-nfs/e3cdaeec85a6cfec980e87fc294327c0381c1778.camel@kernel.org/ Suggested-by: Jeff Layton Signed-off-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Tested-by: Amir Goldstein Signed-off-by: Sasha Levin --- fs/nfsd/filecache.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index be3c1aad50ea..7e23c588f484 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -641,7 +641,7 @@ nfsd_file_cache_init(void) if (!nfsd_filecache_wq) goto out; - nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE, + nfsd_file_hashtbl = kvcalloc(NFSD_FILE_HASH_SIZE, sizeof(*nfsd_file_hashtbl), GFP_KERNEL); if (!nfsd_file_hashtbl) { pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n"); @@ -709,7 +709,7 @@ out_err: nfsd_file_slab = NULL; kmem_cache_destroy(nfsd_file_mark_slab); nfsd_file_mark_slab = NULL; - kfree(nfsd_file_hashtbl); + kvfree(nfsd_file_hashtbl); nfsd_file_hashtbl = NULL; destroy_workqueue(nfsd_filecache_wq); nfsd_filecache_wq = NULL; @@ -855,7 +855,7 @@ nfsd_file_cache_shutdown(void) fsnotify_wait_marks_destroyed(); kmem_cache_destroy(nfsd_file_mark_slab); nfsd_file_mark_slab = NULL; - kfree(nfsd_file_hashtbl); + kvfree(nfsd_file_hashtbl); nfsd_file_hashtbl = NULL; destroy_workqueue(nfsd_filecache_wq); nfsd_filecache_wq = NULL; -- cgit v1.2.3 From 2911ad0249c5ab50afc640982811f11896233b65 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 27 Jan 2022 13:44:49 +0800 Subject: f2fs: fix to avoid potential deadlock [ Upstream commit 344150999b7fc88502a65bbb147a47503eca2033 ] Quoted from Jing Xia's report, there is a potential deadlock may happen between kworker and checkpoint as below: [T:writeback] [T:checkpoint] - wb_writeback - blk_start_plug bio contains NodeA was plugged in writeback threads - do_writepages -- sync write inodeB, inc wb_sync_req[DATA] - f2fs_write_data_pages - f2fs_write_single_data_page -- write last dirty page - f2fs_do_write_data_page - set_page_writeback -- clear page dirty flag and PAGECACHE_TAG_DIRTY tag in radix tree - f2fs_outplace_write_data - f2fs_update_data_blkaddr - f2fs_wait_on_page_writeback -- wait NodeA to writeback here - inode_dec_dirty_pages - writeback_sb_inodes - writeback_single_inode - do_writepages - f2fs_write_data_pages -- skip writepages due to wb_sync_req[DATA] - wbc->pages_skipped += get_dirty_pages() -- PAGECACHE_TAG_DIRTY is not set but get_dirty_pages() returns one - requeue_inode -- requeue inode to wb->b_dirty queue due to non-zero.pages_skipped - blk_finish_plug Let's try to avoid deadlock condition by forcing unplugging previous bio via blk_finish_plug(current->plug) once we'v skipped writeback in writepages() due to valid sbi->wb_sync_req[DATA/NODE]. Fixes: 687de7f1010c ("f2fs: avoid IO split due to mixed WB_SYNC_ALL and WB_SYNC_NONE") Signed-off-by: Zhiguo Niu Signed-off-by: Jing Xia Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/data.c | 6 +++++- fs/f2fs/node.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e662355cf8c9..f6e9fc36b837 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3214,8 +3214,12 @@ static int __f2fs_write_data_pages(struct address_space *mapping, /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ if (wbc->sync_mode == WB_SYNC_ALL) atomic_inc(&sbi->wb_sync_req[DATA]); - else if (atomic_read(&sbi->wb_sync_req[DATA])) + else if (atomic_read(&sbi->wb_sync_req[DATA])) { + /* to avoid potential deadlock */ + if (current->plug) + blk_finish_plug(current->plug); goto skip_write; + } if (__should_serialize_io(inode, wbc)) { mutex_lock(&sbi->writepages); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 556fcd8457f3..69c6bcaf5aae 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2106,8 +2106,12 @@ static int f2fs_write_node_pages(struct address_space *mapping, if (wbc->sync_mode == WB_SYNC_ALL) atomic_inc(&sbi->wb_sync_req[NODE]); - else if (atomic_read(&sbi->wb_sync_req[NODE])) + else if (atomic_read(&sbi->wb_sync_req[NODE])) { + /* to avoid potential deadlock */ + if (current->plug) + blk_finish_plug(current->plug); goto skip_write; + } trace_f2fs_writepages(mapping->host, wbc, NODE); -- cgit v1.2.3 From 1a97987f76b404c684caaba15485d0412923c621 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 17 Feb 2022 12:12:07 +0000 Subject: btrfs: fix unexpected error path when reflinking an inline extent [ Upstream commit 1f4613cdbe7739ce291554b316bff8e551383389 ] When reflinking an inline extent, we assert that its file offset is 0 and that its uncompressed length is not greater than the sector size. We then return an error if one of those conditions is not satisfied. However we use a return statement, which results in returning from btrfs_clone() without freeing the path and buffer that were allocated before, as well as not clearing the flag BTRFS_INODE_NO_DELALLOC_FLUSH for the destination inode. Fix that by jumping to the 'out' label instead, and also add a WARN_ON() for each condition so that in case assertions are disabled, we get to known which of the unexpected conditions triggered the error. Fixes: a61e1e0df9f321 ("Btrfs: simplify inline extent handling when doing reflinks") Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/reflink.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index c71e49782e86..fa60af00ebca 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -505,8 +505,11 @@ process_slot: */ ASSERT(key.offset == 0); ASSERT(datal <= fs_info->sectorsize); - if (key.offset != 0 || datal > fs_info->sectorsize) - return -EUCLEAN; + if (WARN_ON(key.offset != 0) || + WARN_ON(datal > fs_info->sectorsize)) { + ret = -EUCLEAN; + goto out; + } ret = clone_copy_inline_extent(inode, path, &new_key, drop_start, datal, size, -- cgit v1.2.3 From 0f42a02e4773efc4f1edf202fc73d0cd14e8fb09 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Fri, 18 Mar 2022 09:23:04 +0800 Subject: f2fs: fix compressed file start atomic write may cause data corruption [ Upstream commit 9b56adcf525522e9ffa52471260298d91fc1d395 ] When compressed file has blocks, f2fs_ioc_start_atomic_write will succeed, but compressed flag will be remained in inode. If write partial compreseed cluster and commit atomic write will cause data corruption. This is the reproduction process: Step 1: create a compressed file ,write 64K data , call fsync(), then the blocks are write as compressed cluster. Step2: iotcl(F2FS_IOC_START_ATOMIC_WRITE) --- this should be fail, but not. write page 0 and page 3. iotcl(F2FS_IOC_COMMIT_ATOMIC_WRITE) -- page 0 and 3 write as normal file, Step3: drop cache. read page 0-4 -- Since page 0 has a valid block address, read as non-compressed cluster, page 1 and 2 will be filled with compressed data or zero. The root cause is, after commit 7eab7a696827 ("f2fs: compress: remove unneeded read when rewrite whole cluster"), in step 2, f2fs_write_begin() only set target page dirty, and in f2fs_commit_inmem_pages(), we will write partial raw pages into compressed cluster, result in corrupting compressed cluster layout. Fixes: 4c8ff7095bef ("f2fs: support data compression") Fixes: 7eab7a696827 ("f2fs: compress: remove unneeded read when rewrite whole cluster") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Fengnan Chang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/data.c | 2 +- fs/f2fs/file.c | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f6e9fc36b837..4cf522120cb1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3410,7 +3410,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, *fsdata = NULL; - if (len == PAGE_SIZE) + if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode))) goto repeat; ret = f2fs_prepare_compress_overwrite(inode, pagep, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7ed44752c758..0e14dc41ed4e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2002,7 +2002,10 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); - f2fs_disable_compressed_file(inode); + if (!f2fs_disable_compressed_file(inode)) { + ret = -EINVAL; + goto out; + } if (f2fs_is_atomic_file(inode)) { if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) -- cgit v1.2.3 From 3813591bc046f8c4739a6be0c92acf05a1a1d729 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 12 Feb 2022 13:05:32 +0800 Subject: ext2: correct max file size computing [ Upstream commit 50b3a818991074177a56c87124c7a7bdf5fa4f67 ] We need to calculate the max file size accurately if the total blocks that can address by block tree exceed the upper_limit. But this check is not correct now, it only compute the total data blocks but missing metadata blocks are needed. So in the case of "data blocks < upper_limit && total blocks > upper_limit", we will get wrong result. Fortunately, this case could not happen in reality, but it's confused and better to correct the computing. bits data blocks metadatablocks upper_limit 10 16843020 66051 2147483647 11 134480396 263171 1073741823 12 1074791436 1050627 536870911 (*) 13 8594130956 4198403 268435455 (*) 14 68736258060 16785411 134217727 (*) 15 549822930956 67125251 67108863 (*) 16 4398314962956 268468227 33554431 (*) [*] Need to calculate in depth. Fixes: 1c2d14212b15 ("ext2: Fix underflow in ext2_max_size()") Link: https://lore.kernel.org/r/20220212050532.179055-1-yi.zhang@huawei.com Signed-off-by: Zhang Yi Signed-off-by: Jan Kara Signed-off-by: Sasha Levin --- fs/ext2/super.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index d8d580b609ba..3d21279fe2cb 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -753,8 +753,12 @@ static loff_t ext2_max_size(int bits) res += 1LL << (bits-2); res += 1LL << (2*(bits-2)); res += 1LL << (3*(bits-2)); + /* Compute how many metadata blocks are needed */ + meta_blocks = 1; + meta_blocks += 1 + ppb; + meta_blocks += 1 + ppb + ppb * ppb; /* Does block tree limit file size? */ - if (res < upper_limit) + if (res + meta_blocks <= upper_limit) goto check_lfs; res = upper_limit; -- cgit v1.2.3 From ba3a3390c9b17abd71ac44f4a8f7647f3e4e01fc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 15 Feb 2022 15:58:38 -0500 Subject: NFS: Use of mapping_set_error() results in spurious errors [ Upstream commit 6c984083ec2453dfd3fcf98f392f34500c73e3f2 ] The use of mapping_set_error() in conjunction with calls to filemap_check_errors() is problematic because every error gets reported as either an EIO or an ENOSPC by filemap_check_errors() in functions such as filemap_write_and_wait() or filemap_write_and_wait_range(). In almost all cases, we prefer to use the more nuanced wb errors. Fixes: b8946d7bfb94 ("NFS: Revalidate the file mapping on all fatal writeback errors") Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/write.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 7dce3e735fc5..0691b0b02147 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -314,7 +314,10 @@ static void nfs_mapping_set_error(struct page *page, int error) struct address_space *mapping = page_file_mapping(page); SetPageError(page); - mapping_set_error(mapping, error); + filemap_set_wb_err(mapping, error); + if (mapping->host) + errseq_set(&mapping->host->i_sb->s_wb_err, + error == -ENOSPC ? -ENOSPC : -EIO); nfs_set_pageerror(mapping); } -- cgit v1.2.3 From 18dc19571210ca5a8b505234719f6b6c61c2b2a8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 24 Feb 2022 10:59:37 -0500 Subject: NFS: Return valid errors from nfs2/3_decode_dirent() [ Upstream commit 64cfca85bacde54caa64e0ab855c48734894fa37 ] Valid return values for decode_dirent() callback functions are: 0: Success -EBADCOOKIE: End of directory -EAGAIN: End of xdr_stream All errors need to map into one of those three values. Fixes: 573c4e1ef53a ("NFS: Simplify ->decode_dirent() calling sequence") Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/nfs2xdr.c | 2 +- fs/nfs/nfs3xdr.c | 21 ++++++--------------- 2 files changed, 7 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 7fba7711e6b3..3d5ba43f44bb 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -949,7 +949,7 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_filename_inline(xdr, &entry->name, &entry->len); if (unlikely(error)) - return error; + return -EAGAIN; /* * The type (size and byte order) of nfscookie isn't defined in diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 54a1d21cbcc6..7ab60ad98776 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1967,7 +1967,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, bool plus) { struct user_namespace *userns = rpc_userns(entry->server->client); - struct nfs_entry old = *entry; __be32 *p; int error; u64 new_cookie; @@ -1987,15 +1986,15 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_fileid3(xdr, &entry->ino); if (unlikely(error)) - return error; + return -EAGAIN; error = decode_inline_filename3(xdr, &entry->name, &entry->len); if (unlikely(error)) - return error; + return -EAGAIN; error = decode_cookie3(xdr, &new_cookie); if (unlikely(error)) - return error; + return -EAGAIN; entry->d_type = DT_UNKNOWN; @@ -2003,7 +2002,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, entry->fattr->valid = 0; error = decode_post_op_attr(xdr, entry->fattr, userns); if (unlikely(error)) - return error; + return -EAGAIN; if (entry->fattr->valid & NFS_ATTR_FATTR_V3) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); @@ -2018,11 +2017,8 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, return -EAGAIN; if (*p != xdr_zero) { error = decode_nfs_fh3(xdr, entry->fh); - if (unlikely(error)) { - if (error == -E2BIG) - goto out_truncated; - return error; - } + if (unlikely(error)) + return -EAGAIN; } else zero_nfs_fh3(entry->fh); } @@ -2031,11 +2027,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, entry->cookie = new_cookie; return 0; - -out_truncated: - dprintk("NFS: directory entry contains invalid file handle\n"); - *entry = old; - return -EAGAIN; } /* -- cgit v1.2.3 From 75ee75cc360a8ce0ba3a3e796658152f9b3c4694 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Tue, 15 Feb 2022 13:17:04 +0300 Subject: NFS: remove unneeded check in decode_devicenotify_args() [ Upstream commit cb8fac6d2727f79f211e745b16c9abbf4d8be652 ] [You don't often get email from khoroshilov@ispras.ru. Learn why this is important at http://aka.ms/LearnAboutSenderIdentification.] Overflow check in not needed anymore after we switch to kmalloc_array(). Signed-off-by: Alexey Khoroshilov Fixes: a4f743a6bb20 ("NFSv4.1: Convert open-coded array allocation calls to kmalloc_array()") Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/callback_xdr.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index ce3d1d5b1291..ea17085ef884 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -271,10 +271,6 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, n = ntohl(*p++); if (n == 0) goto out; - if (n > ULONG_MAX / sizeof(*args->devs)) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL); if (!args->devs) { -- cgit v1.2.3 From 6bbfe9a715ea075114213c6c8b30dbd2adc9624d Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Sat, 19 Mar 2022 22:30:00 +0300 Subject: jfs: fix divide error in dbNextAG [ Upstream commit 2cc7cc01c15f57d056318c33705647f87dcd4aab ] Syzbot reported divide error in dbNextAG(). The problem was in missing validation check for malicious image. Syzbot crafted an image with bmp->db_numag equal to 0. There wasn't any validation checks, but dbNextAG() blindly use bmp->db_numag in divide expression Fix it by validating bmp->db_numag in dbMount() and return an error if image is malicious Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-and-tested-by: syzbot+46f5c25af73eb8330eb6@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Signed-off-by: Dave Kleikamp Signed-off-by: Sasha Levin --- fs/jfs/jfs_dmap.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 91f4ec93dab1..d8502f4989d9 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -148,6 +148,7 @@ static const s8 budtab[256] = { * 0 - success * -ENOMEM - insufficient memory * -EIO - i/o error + * -EINVAL - wrong bmap data */ int dbMount(struct inode *ipbmap) { @@ -179,6 +180,12 @@ int dbMount(struct inode *ipbmap) bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); + if (!bmp->db_numag) { + release_metapage(mp); + kfree(bmp); + return -EINVAL; + } + bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); -- cgit v1.2.3 From 0445609a7acebfc4fe3849c62adae7893140e7ed Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 24 Mar 2022 10:38:42 -0400 Subject: NFSv4.1: don't retry BIND_CONN_TO_SESSION on session error [ Upstream commit 1d15d121cc2ad4d016a7dc1493132a9696f91fc5 ] There is no reason to retry the operation if a session error had occurred in such case result structure isn't filled out. Fixes: dff58530c4ca ("NFSv4.1: fix handling of backchannel binding in BIND_CONN_TO_SESSION") Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 53be03681f69..dababa6cf3f4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8341,6 +8341,7 @@ nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata) case -NFS4ERR_DEADSESSION: nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); + return; } if (args->dir == NFS4_CDFC4_FORE_OR_BOTH && res->dir != NFS4_CDFS4_BOTH) { -- cgit v1.2.3 From c95578235888431e61d36f212ad2236e0ff8b90f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 25 Mar 2022 21:51:03 -0400 Subject: NFS: Don't loop forever in nfs_do_recoalesce() [ Upstream commit d02d81efc7564b4d5446a02e0214a164cf00b1f3 ] If __nfs_pageio_add_request() fails to add the request, it will return with either desc->pg_error < 0, or mirror->pg_recoalesce will be set, so we are guaranteed either to exit the function altogether, or to loop. However if there is nothing left in mirror->pg_list to coalesce, we must exit, so make sure that we clear mirror->pg_recoalesce every time we loop. Reported-by: Olga Kornievskaia Fixes: 70536bf4eb07 ("NFS: Clean up reset of the mirror accounting variables") Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/pagelist.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index cc232d1f16f2..b1130dc200d2 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1227,6 +1227,7 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) do { list_splice_init(&mirror->pg_list, &head); + mirror->pg_recoalesce = 0; while (!list_empty(&head)) { struct nfs_page *req; -- cgit v1.2.3 From a738ff8143d89abccfeb458d07a9d559ade7f1a1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 28 Mar 2022 08:36:34 -0400 Subject: NFSv4/pNFS: Fix another issue with a list iterator pointing to the head [ Upstream commit 7c9d845f0612e5bcd23456a2ec43be8ac43458f1 ] In nfs4_callback_devicenotify(), if we don't find a matching entry for the deviceid, we're left with a pointer to 'struct nfs_server' that actually points to the list of super blocks associated with our struct nfs_client. Furthermore, even if we have a valid pointer, nothing pins the super block, and so the struct nfs_server could end up getting freed while we're using it. Since all we want is a pointer to the struct pnfs_layoutdriver_type, let's skip all the iteration over super blocks, and just use APIs to find the layout driver directly. Reported-by: Xiaomeng Tong Fixes: 1be5683b03a7 ("pnfs: CB_NOTIFY_DEVICEID") Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/callback_proc.c | 27 +++++++++------------------ fs/nfs/pnfs.c | 11 +++++++++++ fs/nfs/pnfs.h | 2 ++ 3 files changed, 22 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index f2bc5b5b764b..a30dd35ec1c2 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -358,12 +358,11 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp, struct cb_process_state *cps) { struct cb_devicenotifyargs *args = argp; + const struct pnfs_layoutdriver_type *ld = NULL; uint32_t i; __be32 res = 0; - struct nfs_client *clp = cps->clp; - struct nfs_server *server = NULL; - if (!clp) { + if (!cps->clp) { res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); goto out; } @@ -371,23 +370,15 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp, for (i = 0; i < args->ndevs; i++) { struct cb_devicenotifyitem *dev = &args->devs[i]; - if (!server || - server->pnfs_curr_ld->id != dev->cbd_layout_type) { - rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) - if (server->pnfs_curr_ld && - server->pnfs_curr_ld->id == dev->cbd_layout_type) { - rcu_read_unlock(); - goto found; - } - rcu_read_unlock(); - continue; + if (!ld || ld->id != dev->cbd_layout_type) { + pnfs_put_layoutdriver(ld); + ld = pnfs_find_layoutdriver(dev->cbd_layout_type); + if (!ld) + continue; } - - found: - nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id); + nfs4_delete_deviceid(ld, cps->clp, &dev->cbd_dev_id); } - + pnfs_put_layoutdriver(ld); out: kfree(args->devs); return res; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 7c9090a28e5c..7ddd003ab8b1 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -92,6 +92,17 @@ find_pnfs_driver(u32 id) return local; } +const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id) +{ + return find_pnfs_driver(id); +} + +void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld) +{ + if (ld) + module_put(ld->owner); +} + void unset_pnfs_layoutdriver(struct nfs_server *nfss) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index a0f6ff094b3a..5a54cf8ac6f3 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -238,6 +238,8 @@ struct pnfs_devicelist { extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); +extern const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id); +extern void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld); /* nfs4proc.c */ extern size_t max_response_pages(struct nfs_server *server); -- cgit v1.2.3 From 136736abcd35f51a4be76e6cb1a1c3544a4580fa Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 29 Mar 2022 15:06:39 -0700 Subject: fs: fd tables have to be multiples of BITS_PER_LONG [ Upstream commit 1c24a186398f59c80adb9a967486b65c1423a59d ] This has always been the rule: fdtables have several bitmaps in them, and as a result they have to be sized properly for bitmaps. We walk those bitmaps in chunks of 'unsigned long' in serveral cases, but even when we don't, we use the regular kernel bitops that are defined to work on arrays of 'unsigned long', not on some byte array. Now, the distinction between arrays of bytes and 'unsigned long' normally only really ends up being noticeable on big-endian systems, but Fedor Pchelkin and Alexey Khoroshilov reported that copy_fd_bitmaps() could be called with an argument that wasn't even a multiple of BITS_PER_BYTE. And then it fails to do the proper copy even on little-endian machines. The bug wasn't in copy_fd_bitmap(), but in sane_fdtable_size(), which didn't actually sanitize the fdtable size sufficiently, and never made sure it had the proper BITS_PER_LONG alignment. That's partly because the alignment historically came not from having to explicitly align things, but simply from previous fdtable sizes, and from count_open_files(), which counts the file descriptors by walking them one 'unsigned long' word at a time and thus naturally ends up doing sizing in the proper 'chunks of unsigned long'. But with the introduction of close_range(), we now have an external source of "this is how many files we want to have", and so sane_fdtable_size() needs to do a better job. This also adds that explicit alignment to alloc_fdtable(), although there it is mainly just for documentation at a source code level. The arithmetic we do there to pick a reasonable fdtable size already aligns the result sufficiently. In fact,clang notices that the added ALIGN() in that function doesn't actually do anything, and does not generate any extra code for it. It turns out that gcc ends up confusing itself by combining a previous constant-sized shift operation with the variable-sized shift operations in roundup_pow_of_two(). And probably due to that doesn't notice that the ALIGN() is a no-op. But that's a (tiny) gcc misfeature that doesn't matter. Having the explicit alignment makes sense, and would actually matter on a 128-bit architecture if we ever go there. This also adds big comments above both functions about how fdtable sizes have to have that BITS_PER_LONG alignment. Fixes: 60997c3d45d9 ("close_range: add CLOSE_RANGE_UNSHARE") Reported-by: Fedor Pchelkin Reported-by: Alexey Khoroshilov Link: https://lore.kernel.org/all/20220326114009.1690-1-aissur0002@gmail.com/ Tested-and-acked-by: Christian Brauner Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/file.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 97d212a9b814..c01c29417ae6 100644 --- a/fs/file.c +++ b/fs/file.c @@ -87,6 +87,21 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); } +/* + * Note how the fdtable bitmap allocations very much have to be a multiple of + * BITS_PER_LONG. This is not only because we walk those things in chunks of + * 'unsigned long' in some places, but simply because that is how the Linux + * kernel bitmaps are defined to work: they are not "bits in an array of bytes", + * they are very much "bits in an array of unsigned long". + * + * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied + * by that "1024/sizeof(ptr)" before, we already know there are sufficient + * clear low bits. Clang seems to realize that, gcc ends up being confused. + * + * On a 128-bit machine, the ALIGN() would actually matter. In the meantime, + * let's consider it documentation (and maybe a test-case for gcc to improve + * its code generation ;) + */ static struct fdtable * alloc_fdtable(unsigned int nr) { struct fdtable *fdt; @@ -102,6 +117,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr) nr /= (1024 / sizeof(struct file *)); nr = roundup_pow_of_two(nr + 1); nr *= (1024 / sizeof(struct file *)); + nr = ALIGN(nr, BITS_PER_LONG); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open * had been set lower between the check in expand_files() and here. Deal @@ -269,11 +285,25 @@ static unsigned int count_open_files(struct fdtable *fdt) return i; } +/* + * Note that a sane fdtable size always has to be a multiple of + * BITS_PER_LONG, since we have bitmaps that are sized by this. + * + * 'max_fds' will normally already be properly aligned, but it + * turns out that in the close_range() -> __close_range() -> + * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end + * up having a 'max_fds' value that isn't already aligned. + * + * Rather than make close_range() have to worry about this, + * just make that BITS_PER_LONG alignment be part of a sane + * fdtable size. Becuase that's really what it is. + */ static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds) { unsigned int count; count = count_open_files(fdt); + max_fds = ALIGN(max_fds, BITS_PER_LONG); if (max_fds < NR_OPEN_DEFAULT) max_fds = NR_OPEN_DEFAULT; return min(count, max_fds); -- cgit v1.2.3 From c331c9d1d2b7a5ce875f8f1ac53e3987f73ecc75 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 29 Mar 2022 23:29:18 -0700 Subject: fs: fix fd table size alignment properly [ Upstream commit d888c83fcec75194a8a48ccd283953bdba7b2550 ] Jason Donenfeld reports that my commit 1c24a186398f ("fs: fd tables have to be multiples of BITS_PER_LONG") doesn't work, and the reason is an embarrassing brown-paper-bag bug. Yes, we want to align the number of fds to BITS_PER_LONG, and yes, the reason they might not be aligned is because the incoming 'max_fd' argument might not be aligned. But aligining the argument - while simple - will cause a "infinitely big" maxfd (eg NR_OPEN_MAX) to just overflow to zero. Which most definitely isn't what we want either. The obvious fix was always just to do the alignment last, but I had moved it earlier just to make the patch smaller and the code look simpler. Duh. It certainly made _me_ look simple. Fixes: 1c24a186398f ("fs: fd tables have to be multiples of BITS_PER_LONG") Reported-and-tested-by: Jason A. Donenfeld Cc: Fedor Pchelkin Cc: Alexey Khoroshilov Cc: Christian Brauner Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/file.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index c01c29417ae6..ee9317346702 100644 --- a/fs/file.c +++ b/fs/file.c @@ -303,10 +303,9 @@ static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds) unsigned int count; count = count_open_files(fdt); - max_fds = ALIGN(max_fds, BITS_PER_LONG); if (max_fds < NR_OPEN_DEFAULT) max_fds = NR_OPEN_DEFAULT; - return min(count, max_fds); + return ALIGN(min(count, max_fds), BITS_PER_LONG); } /* -- cgit v1.2.3 From e0943c456b6002675e9e778745bd12c8c0611dca Mon Sep 17 00:00:00 2001 From: Akira Kawata Date: Thu, 27 Jan 2022 21:40:16 +0900 Subject: fs/binfmt_elf: Fix AT_PHDR for unusual ELF files [ Upstream commit 0da1d5002745cdc721bc018b582a8a9704d56c42 ] BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=197921 As pointed out in the discussion of buglink, we cannot calculate AT_PHDR as the sum of load_addr and exec->e_phoff. : The AT_PHDR of ELF auxiliary vectors should point to the memory address : of program header. But binfmt_elf.c calculates this address as follows: : : NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff); : : which is wrong since e_phoff is the file offset of program header and : load_addr is the memory base address from PT_LOAD entry. : : The ld.so uses AT_PHDR as the memory address of program header. In normal : case, since the e_phoff is usually 64 and in the first PT_LOAD region, it : is the correct program header address. : : But if the address of program header isn't equal to the first PT_LOAD : address + e_phoff (e.g. Put the program header in other non-consecutive : PT_LOAD region), ld.so will try to read program header from wrong address : then crash or use incorrect program header. This is because exec->e_phoff is the offset of PHDRs in the file and the address of PHDRs in the memory may differ from it. This patch fixes the bug by calculating the address of program headers from PT_LOADs directly. Signed-off-by: Akira Kawata Reported-by: kernel test robot Acked-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220127124014.338760-2-akirakawata1@gmail.com Signed-off-by: Sasha Levin --- fs/binfmt_elf.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index a813b70f594e..3f6a7cac68fd 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -170,8 +170,8 @@ static int padzero(unsigned long elf_bss) static int create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, - unsigned long load_addr, unsigned long interp_load_addr, - unsigned long e_entry) + unsigned long interp_load_addr, + unsigned long e_entry, unsigned long phdr_addr) { struct mm_struct *mm = current->mm; unsigned long p = bprm->p; @@ -257,7 +257,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE); NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); - NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff); + NEW_AUX_ENT(AT_PHDR, phdr_addr); NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); NEW_AUX_ENT(AT_PHNUM, exec->e_phnum); NEW_AUX_ENT(AT_BASE, interp_load_addr); @@ -823,7 +823,7 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ - unsigned long load_addr = 0, load_bias = 0; + unsigned long load_addr, load_bias = 0, phdr_addr = 0; int load_addr_set = 0; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; @@ -1156,6 +1156,17 @@ out_free_interp: reloc_func_desc = load_bias; } } + + /* + * Figure out which segment in the file contains the Program + * Header table, and map to the associated memory address. + */ + if (elf_ppnt->p_offset <= elf_ex->e_phoff && + elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) { + phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset + + elf_ppnt->p_vaddr; + } + k = elf_ppnt->p_vaddr; if ((elf_ppnt->p_flags & PF_X) && k < start_code) start_code = k; @@ -1191,6 +1202,7 @@ out_free_interp: } e_entry = elf_ex->e_entry + load_bias; + phdr_addr += load_bias; elf_bss += load_bias; elf_brk += load_bias; start_code += load_bias; @@ -1254,8 +1266,8 @@ out_free_interp: goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ - retval = create_elf_tables(bprm, elf_ex, - load_addr, interp_load_addr, e_entry); + retval = create_elf_tables(bprm, elf_ex, interp_load_addr, + e_entry, phdr_addr); if (retval < 0) goto out; -- cgit v1.2.3 From 572d14e6cec4738d9e2878bd2cf8c070e3b64708 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 16 Feb 2022 12:32:43 +0530 Subject: ext4: correct cluster len and clusters changed accounting in ext4_mb_mark_bb [ Upstream commit a5c0e2fdf7cea535ba03259894dc184e5a4c2800 ] ext4_mb_mark_bb() currently wrongly calculates cluster len (clen) and flex_group->free_clusters. This patch fixes that. Identified based on code review of ext4_mb_mark_bb() function. Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/a0b035d536bafa88110b74456853774b64c8ac40.1644992609.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin --- fs/ext4/mballoc.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d2c906d3f63b..d88f1364fa5a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3899,10 +3899,11 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ext4_grpblk_t blkoff; - int i, clen, err; + int i, err; int already; + unsigned int clen, clen_changed; - clen = EXT4_B2C(sbi, len); + clen = EXT4_NUM_B2C(sbi, len); ext4_get_group_no_and_offset(sb, block, &group, &blkoff); bitmap_bh = ext4_read_block_bitmap(sb, group); @@ -3923,6 +3924,7 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == !state) already++; + clen_changed = clen - already; if (state) ext4_set_bits(bitmap_bh->b_data, blkoff, clen); else @@ -3935,9 +3937,9 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, group, gdp)); } if (state) - clen = ext4_free_group_clusters(sb, gdp) - clen + already; + clen = ext4_free_group_clusters(sb, gdp) - clen_changed; else - clen = ext4_free_group_clusters(sb, gdp) + clen - already; + clen = ext4_free_group_clusters(sb, gdp) + clen_changed; ext4_free_group_clusters_set(sb, gdp, clen); ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh); @@ -3947,10 +3949,13 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, group); + struct flex_groups *fg = sbi_array_rcu_deref(sbi, + s_flex_groups, flex_group); - atomic64_sub(len, - &sbi_array_rcu_deref(sbi, s_flex_groups, - flex_group)->free_clusters); + if (state) + atomic64_sub(clen_changed, &fg->free_clusters); + else + atomic64_add(clen_changed, &fg->free_clusters); } err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); -- cgit v1.2.3 From 6a6beb074186a0452368a023a261c7d0eaebe838 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 16 Feb 2022 12:32:44 +0530 Subject: ext4: fix ext4_mb_mark_bb() with flex_bg with fast_commit [ Upstream commit bfdc502a4a4c058bf4cbb1df0c297761d528f54d ] In case of flex_bg feature (which is by default enabled), extents for any given inode might span across blocks from two different block group. ext4_mb_mark_bb() only reads the buffer_head of block bitmap once for the starting block group, but it fails to read it again when the extent length boundary overflows to another block group. Then in this below loop it accesses memory beyond the block group bitmap buffer_head and results into a data abort. for (i = 0; i < clen; i++) if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == !state) already++; This patch adds this functionality for checking block group boundary in ext4_mb_mark_bb() and update the buffer_head(bitmap_bh) for every different block group. w/o this patch, I was easily able to hit a data access abort using Power platform. <...> [ 74.327662] EXT4-fs error (device loop3): ext4_mb_generate_buddy:1141: group 11, block bitmap and bg descriptor inconsistent: 21248 vs 23294 free clusters [ 74.533214] EXT4-fs (loop3): shut down requested (2) [ 74.536705] Aborting journal on device loop3-8. [ 74.702705] BUG: Unable to handle kernel data access on read at 0xc00000005e980000 [ 74.703727] Faulting instruction address: 0xc0000000007bffb8 cpu 0xd: Vector: 300 (Data Access) at [c000000015db7060] pc: c0000000007bffb8: ext4_mb_mark_bb+0x198/0x5a0 lr: c0000000007bfeec: ext4_mb_mark_bb+0xcc/0x5a0 sp: c000000015db7300 msr: 800000000280b033 dar: c00000005e980000 dsisr: 40000000 current = 0xc000000027af6880 paca = 0xc00000003ffd5200 irqmask: 0x03 irq_happened: 0x01 pid = 5167, comm = mount <...> enter ? for help [c000000015db7380] c000000000782708 ext4_ext_clear_bb+0x378/0x410 [c000000015db7400] c000000000813f14 ext4_fc_replay+0x1794/0x2000 [c000000015db7580] c000000000833f7c do_one_pass+0xe9c/0x12a0 [c000000015db7710] c000000000834504 jbd2_journal_recover+0x184/0x2d0 [c000000015db77c0] c000000000841398 jbd2_journal_load+0x188/0x4a0 [c000000015db7880] c000000000804de8 ext4_fill_super+0x2638/0x3e10 [c000000015db7a40] c0000000005f8404 get_tree_bdev+0x2b4/0x350 [c000000015db7ae0] c0000000007ef058 ext4_get_tree+0x28/0x40 [c000000015db7b00] c0000000005f6344 vfs_get_tree+0x44/0x100 [c000000015db7b70] c00000000063c408 path_mount+0xdd8/0xe70 [c000000015db7c40] c00000000063c8f0 sys_mount+0x450/0x550 [c000000015db7d50] c000000000035770 system_call_exception+0x4a0/0x4e0 [c000000015db7e10] c00000000000c74c system_call_common+0xec/0x250 Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/2609bc8f66fc15870616ee416a18a3d392a209c4.1644992609.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin --- fs/ext4/mballoc.c | 131 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 76 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d88f1364fa5a..46608723b6ed 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3901,72 +3901,93 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, ext4_grpblk_t blkoff; int i, err; int already; - unsigned int clen, clen_changed; + unsigned int clen, clen_changed, thisgrp_len; - clen = EXT4_NUM_B2C(sbi, len); - - ext4_get_group_no_and_offset(sb, block, &group, &blkoff); - bitmap_bh = ext4_read_block_bitmap(sb, group); - if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - bitmap_bh = NULL; - goto out_err; - } - - err = -EIO; - gdp = ext4_get_group_desc(sb, group, &gdp_bh); - if (!gdp) - goto out_err; + while (len > 0) { + ext4_get_group_no_and_offset(sb, block, &group, &blkoff); - ext4_lock_group(sb, group); - already = 0; - for (i = 0; i < clen; i++) - if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == !state) - already++; - - clen_changed = clen - already; - if (state) - ext4_set_bits(bitmap_bh->b_data, blkoff, clen); - else - mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen); - if (ext4_has_group_desc_csum(sb) && - (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_group_clusters_set(sb, gdp, - ext4_free_clusters_after_init(sb, - group, gdp)); - } - if (state) - clen = ext4_free_group_clusters(sb, gdp) - clen_changed; - else - clen = ext4_free_group_clusters(sb, gdp) + clen_changed; + /* + * Check to see if we are freeing blocks across a group + * boundary. + * In case of flex_bg, this can happen that (block, len) may + * span across more than one group. In that case we need to + * get the corresponding group metadata to work with. + * For this we have goto again loop. + */ + thisgrp_len = min_t(unsigned int, (unsigned int)len, + EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff)); + clen = EXT4_NUM_B2C(sbi, thisgrp_len); - ext4_free_group_clusters_set(sb, gdp, clen); - ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, group, gdp); + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); + bitmap_bh = NULL; + break; + } - ext4_unlock_group(sb, group); + err = -EIO; + gdp = ext4_get_group_desc(sb, group, &gdp_bh); + if (!gdp) + break; - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, group); - struct flex_groups *fg = sbi_array_rcu_deref(sbi, - s_flex_groups, flex_group); + ext4_lock_group(sb, group); + already = 0; + for (i = 0; i < clen; i++) + if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == + !state) + already++; + clen_changed = clen - already; if (state) - atomic64_sub(clen_changed, &fg->free_clusters); + ext4_set_bits(bitmap_bh->b_data, blkoff, clen); else - atomic64_add(clen_changed, &fg->free_clusters); + mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen); + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, group, gdp)); + } + if (state) + clen = ext4_free_group_clusters(sb, gdp) - clen_changed; + else + clen = ext4_free_group_clusters(sb, gdp) + clen_changed; + + ext4_free_group_clusters_set(sb, gdp, clen); + ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + + ext4_unlock_group(sb, group); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, group); + struct flex_groups *fg = sbi_array_rcu_deref(sbi, + s_flex_groups, flex_group); + + if (state) + atomic64_sub(clen_changed, &fg->free_clusters); + else + atomic64_add(clen_changed, &fg->free_clusters); + + } + + err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); + if (err) + break; + sync_dirty_buffer(bitmap_bh); + err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); + sync_dirty_buffer(gdp_bh); + if (err) + break; + + block += thisgrp_len; + len -= thisgrp_len; + brelse(bitmap_bh); + BUG_ON(len < 0); } - err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); if (err) - goto out_err; - sync_dirty_buffer(bitmap_bh); - err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); - sync_dirty_buffer(gdp_bh); - -out_err: - brelse(bitmap_bh); + brelse(bitmap_bh); } /* -- cgit v1.2.3 From a0856764dc1276ad2dc7891288c2e9246bf11a37 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 3 Mar 2022 09:38:47 -0500 Subject: ext4: don't BUG if someone dirty pages without asking ext4 first [ Upstream commit cc5095747edfb054ca2068d01af20be3fcc3634f ] [un]pin_user_pages_remote is dirtying pages without properly warning the file system in advance. A related race was noted by Jan Kara in 2018[1]; however, more recently instead of it being a very hard-to-hit race, it could be reliably triggered by process_vm_writev(2) which was discovered by Syzbot[2]. This is technically a bug in mm/gup.c, but arguably ext4 is fragile in that if some other kernel subsystem dirty pages without properly notifying the file system using page_mkwrite(), ext4 will BUG, while other file systems will not BUG (although data will still be lost). So instead of crashing with a BUG, issue a warning (since there may be potential data loss) and just mark the page as clean to avoid unprivileged denial of service attacks until the problem can be properly fixed. More discussion and background can be found in the thread starting at [2]. [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz [2] https://lore.kernel.org/r/Yg0m6IjcNmfaSokM@google.com Reported-by: syzbot+d59332e2db681cf18f0318a06e994ebbb529a8db@syzkaller.appspotmail.com Reported-by: Lee Jones Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/YiDS9wVfq4mM2jGK@mit.edu Signed-off-by: Sasha Levin --- fs/ext4/inode.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 22a5140546fb..fff52292c01e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1992,6 +1992,15 @@ static int ext4_writepage(struct page *page, else len = PAGE_SIZE; + /* Should never happen but for bugs in other kernel subsystems */ + if (!page_has_buffers(page)) { + ext4_warning_inode(inode, + "page %lu does not have buffers attached", page->index); + ClearPageDirty(page); + unlock_page(page); + return 0; + } + page_bufs = page_buffers(page); /* * We cannot do block allocation or other extent handling in this @@ -2595,6 +2604,22 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); + /* + * Should never happen but for buggy code in + * other subsystems that call + * set_page_dirty() without properly warning + * the file system first. See [1] for more + * information. + * + * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz + */ + if (!page_has_buffers(page)) { + ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index); + ClearPageDirty(page); + unlock_page(page); + continue; + } + if (mpd->map.m_len == 0) mpd->first_page = page->index; mpd->next_page = page->index + 1; -- cgit v1.2.3 From f68caedf264a95c0b02dfd0d9f92ac2637d5848a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 4 Mar 2022 09:49:13 +0800 Subject: f2fs: fix to do sanity check on curseg->alloc_type [ Upstream commit f41ee8b91c00770d718be2ff4852a80017ae9ab3 ] As Wenqing Liu reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215657 - Overview UBSAN: array-index-out-of-bounds in fs/f2fs/segment.c:3460:2 when mount and operate a corrupted image - Reproduce tested on kernel 5.17-rc4, 5.17-rc6 1. mkdir test_crash 2. cd test_crash 3. unzip tmp2.zip 4. mkdir mnt 5. ./single_test.sh f2fs 2 - Kernel dump [ 46.434454] loop0: detected capacity change from 0 to 131072 [ 46.529839] F2FS-fs (loop0): Mounted with checkpoint version = 7548c2d9 [ 46.738319] ================================================================================ [ 46.738412] UBSAN: array-index-out-of-bounds in fs/f2fs/segment.c:3460:2 [ 46.738475] index 231 is out of range for type 'unsigned int [2]' [ 46.738539] CPU: 2 PID: 939 Comm: umount Not tainted 5.17.0-rc6 #1 [ 46.738547] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [ 46.738551] Call Trace: [ 46.738556] [ 46.738563] dump_stack_lvl+0x47/0x5c [ 46.738581] ubsan_epilogue+0x5/0x50 [ 46.738592] __ubsan_handle_out_of_bounds+0x68/0x80 [ 46.738604] f2fs_allocate_data_block+0xdff/0xe60 [f2fs] [ 46.738819] do_write_page+0xef/0x210 [f2fs] [ 46.738934] f2fs_do_write_node_page+0x3f/0x80 [f2fs] [ 46.739038] __write_node_page+0x2b7/0x920 [f2fs] [ 46.739162] f2fs_sync_node_pages+0x943/0xb00 [f2fs] [ 46.739293] f2fs_write_checkpoint+0x7bb/0x1030 [f2fs] [ 46.739405] kill_f2fs_super+0x125/0x150 [f2fs] [ 46.739507] deactivate_locked_super+0x60/0xc0 [ 46.739517] deactivate_super+0x70/0xb0 [ 46.739524] cleanup_mnt+0x11a/0x200 [ 46.739532] __cleanup_mnt+0x16/0x20 [ 46.739538] task_work_run+0x67/0xa0 [ 46.739547] exit_to_user_mode_prepare+0x18c/0x1a0 [ 46.739559] syscall_exit_to_user_mode+0x26/0x40 [ 46.739568] do_syscall_64+0x46/0xb0 [ 46.739584] entry_SYSCALL_64_after_hwframe+0x44/0xae The root cause is we missed to do sanity check on curseg->alloc_type, result in out-of-bound accessing on sbi->block_count[] array, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/segment.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d716553bdc02..338a57360bb8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4747,6 +4747,13 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) sanity_check_seg_type(sbi, curseg->seg_type); + if (curseg->alloc_type != LFS && curseg->alloc_type != SSR) { + f2fs_err(sbi, + "Current segment has invalid alloc_type:%d", + curseg->alloc_type); + return -EFSCORRUPTED; + } + if (f2fs_test_bit(blkofs, se->cur_valid_map)) goto out; -- cgit v1.2.3 From 7260793c13e9ae730867f0f6515d1af84f24308a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 14 Feb 2022 15:30:13 -0500 Subject: NFSD: Fix nfsd_breaker_owns_lease() return values [ Upstream commit 50719bf3442dd6cd05159e9c98d020b3919ce978 ] These have been incorrect since the function was introduced. A proper kerneldoc comment is added since this function, though static, is part of an external interface. Reported-by: Dai Ngo Signed-off-by: Chuck Lever Signed-off-by: Sasha Levin --- fs/nfsd/nfs4state.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index db4a47a280dc..181bc3d9f566 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4693,6 +4693,14 @@ nfsd_break_deleg_cb(struct file_lock *fl) return ret; } +/** + * nfsd_breaker_owns_lease - Check if lease conflict was resolved + * @fl: Lock state to check + * + * Return values: + * %true: Lease conflict was resolved + * %false: Lease conflict was not resolved. + */ static bool nfsd_breaker_owns_lease(struct file_lock *fl) { struct nfs4_delegation *dl = fl->fl_owner; @@ -4700,11 +4708,11 @@ static bool nfsd_breaker_owns_lease(struct file_lock *fl) struct nfs4_client *clp; if (!i_am_nfsd()) - return NULL; + return false; rqst = kthread_data(current); /* Note rq_prog == NFS_ACL_PROGRAM is also possible: */ if (rqst->rq_prog != NFS_PROGRAM || rqst->rq_vers < 4) - return NULL; + return false; clp = *(rqst->rq_lease_breaker); return dl->dl_stid.sc_client == clp; } -- cgit v1.2.3 From 58d3aa672d1386d4ccd2c11d6f1a09675c30f57a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 4 Mar 2022 09:40:05 -0800 Subject: f2fs: don't get FREEZE lock in f2fs_evict_inode in frozen fs [ Upstream commit ba900534f807f0b327c92d5141c85d2313e2d55c ] Let's purge inode cache in order to avoid the below deadlock. [freeze test] shrinkder freeze_super - pwercpu_down_write(SB_FREEZE_FS) - super_cache_scan - down_read(&sb->s_umount) - prune_icache_sb - dispose_list - evict - f2fs_evict_inode thaw_super - down_write(&sb->s_umount); - __percpu_down_read(SB_FREEZE_FS) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- Documentation/ABI/testing/sysfs-fs-f2fs | 1 + fs/f2fs/debug.c | 1 + fs/f2fs/f2fs.h | 1 + fs/f2fs/inode.c | 6 ++++-- fs/f2fs/super.c | 4 ++++ 5 files changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index f627e705e663..48d41b669627 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -425,6 +425,7 @@ Description: Show status of f2fs superblock in real time. 0x800 SBI_QUOTA_SKIP_FLUSH skip flushing quota in current CP 0x1000 SBI_QUOTA_NEED_REPAIR quota file may be corrupted 0x2000 SBI_IS_RESIZEFS resizefs is in process + 0x4000 SBI_IS_FREEZING freefs is in process ====== ===================== ================================= What: /sys/fs/f2fs//ckpt_thread_ioprio diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 8c50518475a9..07ad0d81f0c5 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -338,6 +338,7 @@ static char *s_flag[] = { [SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush", [SBI_QUOTA_NEED_REPAIR] = " quota_need_repair", [SBI_IS_RESIZEFS] = " resizefs", + [SBI_IS_FREEZING] = " freezefs", }; static int stat_show(struct seq_file *s, void *v) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c68817d83a53..0a0fa1a64d06 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1253,6 +1253,7 @@ enum { SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ SBI_IS_RESIZEFS, /* resizefs is in process */ + SBI_IS_FREEZING, /* freezefs is in process */ }; enum { diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 6488f5ff250c..2272000fb10b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -769,7 +769,8 @@ void f2fs_evict_inode(struct inode *inode) f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); - sb_start_intwrite(inode->i_sb); + if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING)) + sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); i_size_write(inode, 0); retry: @@ -800,7 +801,8 @@ retry: if (dquot_initialize_needed(inode)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); } - sb_end_intwrite(inode->i_sb); + if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING)) + sb_end_intwrite(inode->i_sb); no_delete: dquot_drop(inode); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7b744ceb17a5..6dc66b7bc1f5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1663,11 +1663,15 @@ static int f2fs_freeze(struct super_block *sb) /* ensure no checkpoint required */ if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list)) return -EINVAL; + + /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */ + set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); return 0; } static int f2fs_unfreeze(struct super_block *sb) { + clear_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); return 0; } -- cgit v1.2.3 From 40d006dfedd60ed8415bf8edec539be2d6c61662 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 12 Jan 2022 13:05:59 +0800 Subject: btrfs: harden identification of a stale device [ Upstream commit 770c79fb65506fc7c16459855c3839429f46cb32 ] Identifying and removing the stale device from the fs_uuids list is done by btrfs_free_stale_devices(). btrfs_free_stale_devices() in turn depends on device_path_matched() to check if the device appears in more than one btrfs_device structure. The matching of the device happens by its path, the device path. However, when device mapper is in use, the dm device paths are nothing but a link to the actual block device, which leads to the device_path_matched() failing to match. Fix this by matching the dev_t as provided by lookup_bdev() instead of plain string compare of the device paths. Reported-by: Josef Bacik Signed-off-by: Anand Jain Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/volumes.c | 45 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 57021cc83811..f6d6825f218a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -530,15 +530,48 @@ error: return ret; } -static bool device_path_matched(const char *path, struct btrfs_device *device) +/* + * Check if the device in the path matches the device in the given struct device. + * + * Returns: + * true If it is the same device. + * false If it is not the same device or on error. + */ +static bool device_matched(const struct btrfs_device *device, const char *path) { - int found; + char *device_name; + dev_t dev_old; + dev_t dev_new; + int ret; + + /* + * If we are looking for a device with the matching dev_t, then skip + * device without a name (a missing device). + */ + if (!device->name) + return false; + + device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL); + if (!device_name) + return false; rcu_read_lock(); - found = strcmp(rcu_str_deref(device->name), path); + scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name)); rcu_read_unlock(); - return found == 0; + ret = lookup_bdev(device_name, &dev_old); + kfree(device_name); + if (ret) + return false; + + ret = lookup_bdev(path, &dev_new); + if (ret) + return false; + + if (dev_old == dev_new) + return true; + + return false; } /* @@ -571,9 +604,7 @@ static int btrfs_free_stale_devices(const char *path, &fs_devices->devices, dev_list) { if (skip_device && skip_device == device) continue; - if (path && !device->name) - continue; - if (path && !device_path_matched(path, device)) + if (path && !device_matched(device, path)) continue; if (fs_devices->opened) { /* for an already deleted device return 0 */ -- cgit v1.2.3 From c78bada18aa11ee70a8e26e1972595539617425a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Feb 2022 10:03:22 -0500 Subject: btrfs: make search_csum_tree return 0 if we get -EFBIG [ Upstream commit 03ddb19d2ea745228879b9334f3b550c88acb10a ] We can either fail to find a csum entry at all and return -ENOENT, or we can find a range that is close, but return -EFBIG. In essence these both mean the same thing when we are doing a lookup for a csum in an existing range, we didn't find a csum. We want to treat both of these errors the same way, complain loudly that there wasn't a csum. This currently happens anyway because we do count = search_csum_tree(); if (count <= 0) { // reloc and error handling } However it forces us to incorrectly treat EIO or ENOMEM errors as on disk corruption. Fix this by returning 0 if we get either -ENOENT or -EFBIG from btrfs_lookup_csum() so we can do proper error handling. Reviewed-by: Boris Burkov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/file-item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 0b9401a5afd3..161a69d7e117 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -303,7 +303,7 @@ found: read_extent_buffer(path->nodes[0], dst, (unsigned long)item, ret * csum_size); out: - if (ret == -ENOENT) + if (ret == -ENOENT || ret == -EFBIG) ret = 0; return ret; } -- cgit v1.2.3 From 2eff60346e7ae1a24cd868b8fdcf58e946e7dde1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 7 Mar 2022 17:27:46 -0800 Subject: f2fs: use spin_lock to avoid hang [ Upstream commit 98237fcda4a24e67b0a4498c17d5aa4ad4537bc7 ] [14696.634553] task:cat state:D stack: 0 pid:1613738 ppid:1613735 flags:0x00000004 [14696.638285] Call Trace: [14696.639038] [14696.640032] __schedule+0x302/0x930 [14696.640969] schedule+0x58/0xd0 [14696.641799] schedule_preempt_disabled+0x18/0x30 [14696.642890] __mutex_lock.constprop.0+0x2fb/0x4f0 [14696.644035] ? mod_objcg_state+0x10c/0x310 [14696.645040] ? obj_cgroup_charge+0xe1/0x170 [14696.646067] __mutex_lock_slowpath+0x13/0x20 [14696.647126] mutex_lock+0x34/0x40 [14696.648070] stat_show+0x25/0x17c0 [f2fs] [14696.649218] seq_read_iter+0x120/0x4b0 [14696.650289] ? aa_file_perm+0x12a/0x500 [14696.651357] ? lru_cache_add+0x1c/0x20 [14696.652470] seq_read+0xfd/0x140 [14696.653445] full_proxy_read+0x5c/0x80 [14696.654535] vfs_read+0xa0/0x1a0 [14696.655497] ksys_read+0x67/0xe0 [14696.656502] __x64_sys_read+0x1a/0x20 [14696.657580] do_syscall_64+0x3b/0xc0 [14696.658671] entry_SYSCALL_64_after_hwframe+0x44/0xae [14696.660068] RIP: 0033:0x7efe39df1cb2 [14696.661133] RSP: 002b:00007ffc8badd948 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [14696.662958] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007efe39df1cb2 [14696.664757] RDX: 0000000000020000 RSI: 00007efe399df000 RDI: 0000000000000003 [14696.666542] RBP: 00007efe399df000 R08: 00007efe399de010 R09: 00007efe399de010 [14696.668363] R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000000000 [14696.670155] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 [14696.671965] [14696.672826] task:umount state:D stack: 0 pid:1614985 ppid:1614984 flags:0x00004000 [14696.674930] Call Trace: [14696.675903] [14696.676780] __schedule+0x302/0x930 [14696.677927] schedule+0x58/0xd0 [14696.679019] schedule_preempt_disabled+0x18/0x30 [14696.680412] __mutex_lock.constprop.0+0x2fb/0x4f0 [14696.681783] ? destroy_inode+0x65/0x80 [14696.683006] __mutex_lock_slowpath+0x13/0x20 [14696.684305] mutex_lock+0x34/0x40 [14696.685442] f2fs_destroy_stats+0x1e/0x60 [f2fs] [14696.686803] f2fs_put_super+0x158/0x390 [f2fs] [14696.688238] generic_shutdown_super+0x7a/0x120 [14696.689621] kill_block_super+0x27/0x50 [14696.690894] kill_f2fs_super+0x7f/0x100 [f2fs] [14696.692311] deactivate_locked_super+0x35/0xa0 [14696.693698] deactivate_super+0x40/0x50 [14696.694985] cleanup_mnt+0x139/0x190 [14696.696209] __cleanup_mnt+0x12/0x20 [14696.697390] task_work_run+0x64/0xa0 [14696.698587] exit_to_user_mode_prepare+0x1b7/0x1c0 [14696.700053] syscall_exit_to_user_mode+0x27/0x50 [14696.701418] do_syscall_64+0x48/0xc0 [14696.702630] entry_SYSCALL_64_after_hwframe+0x44/0xae Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/debug.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 07ad0d81f0c5..b449c7a372a4 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -21,7 +21,7 @@ #include "gc.h" static LIST_HEAD(f2fs_stat_list); -static DEFINE_MUTEX(f2fs_stat_mutex); +static DEFINE_RAW_SPINLOCK(f2fs_stat_lock); #ifdef CONFIG_DEBUG_FS static struct dentry *f2fs_debugfs_root; #endif @@ -345,8 +345,9 @@ static int stat_show(struct seq_file *s, void *v) { struct f2fs_stat_info *si; int i = 0, j = 0; + unsigned long flags; - mutex_lock(&f2fs_stat_mutex); + raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_for_each_entry(si, &f2fs_stat_list, stat_list) { update_general_status(si->sbi); @@ -574,7 +575,7 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } - mutex_unlock(&f2fs_stat_mutex); + raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); return 0; } @@ -585,6 +586,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; + unsigned long flags; int i; si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL); @@ -620,9 +622,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->max_aw_cnt, 0); atomic_set(&sbi->max_vw_cnt, 0); - mutex_lock(&f2fs_stat_mutex); + raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_add_tail(&si->stat_list, &f2fs_stat_list); - mutex_unlock(&f2fs_stat_mutex); + raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); return 0; } @@ -630,10 +632,11 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); + unsigned long flags; - mutex_lock(&f2fs_stat_mutex); + raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_del(&si->stat_list); - mutex_unlock(&f2fs_stat_mutex); + raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); kfree(si); } -- cgit v1.2.3 From 1a55c48bba8161b417efa3a2580e56602d161d9d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 16 Mar 2022 18:20:00 +0800 Subject: f2fs: compress: fix to print raw data size in error path of lz4 decompression [ Upstream commit d284af43f703760e261b1601378a0c13a19d5f1f ] In lz4_decompress_pages(), if size of decompressed data is not equal to expected one, we should print the size rather than size of target buffer for decompressed data, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/compress.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 58d255d3a518..6adf04725954 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -312,10 +312,9 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic) } if (ret != PAGE_SIZE << dic->log_cluster_size) { - printk_ratelimited("%sF2FS-fs (%s): lz4 invalid rlen:%zu, " + printk_ratelimited("%sF2FS-fs (%s): lz4 invalid ret:%d, " "expected:%lu\n", KERN_ERR, - F2FS_I_SB(dic->inode)->sb->s_id, - dic->rlen, + F2FS_I_SB(dic->inode)->sb->s_id, ret, PAGE_SIZE << dic->log_cluster_size); return -EIO; } -- cgit v1.2.3 From 9dd6bb11df64a2b57f494b775f892569c817d318 Mon Sep 17 00:00:00 2001 From: Rohith Surabattula Date: Mon, 7 Mar 2022 18:37:22 +0000 Subject: Adjust cifssb maximum read size [ Upstream commit 06a466565d54a1a42168f9033a062a3f5c40e73b ] When session gets reconnected during mount then read size in super block fs context gets set to zero and after negotiate, rsize is not modified which results in incorrect read with requested bytes as zero. Fixes intermittent failure of xfstest generic/240 Note that stable requires a different version of this patch which will be sent to the stable mailing list. Signed-off-by: Rohith Surabattula Acked-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/cifsfs.c | 3 +++ fs/cifs/file.c | 10 ++++++++++ 2 files changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 22a1d8156220..ed220daca3e1 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -210,6 +210,9 @@ cifs_read_super(struct super_block *sb) if (rc) goto out_no_root; /* tune readahead according to rsize if readahead size not set on mount */ + if (cifs_sb->ctx->rsize == 0) + cifs_sb->ctx->rsize = + tcon->ses->server->ops->negotiate_rsize(tcon, cifs_sb->ctx); if (cifs_sb->ctx->rasize) sb->s_bdi->ra_pages = cifs_sb->ctx->rasize / PAGE_SIZE; else diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 82bbaf8e92b7..b23f6b489bb9 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3734,6 +3734,11 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, break; } + if (cifs_sb->ctx->rsize == 0) + cifs_sb->ctx->rsize = + server->ops->negotiate_rsize(tlink_tcon(open_file->tlink), + cifs_sb->ctx); + rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &rsize, credits); if (rc) @@ -4512,6 +4517,11 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, break; } + if (cifs_sb->ctx->rsize == 0) + cifs_sb->ctx->rsize = + server->ops->negotiate_rsize(tlink_tcon(open_file->tlink), + cifs_sb->ctx); + rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &rsize, credits); if (rc) -- cgit v1.2.3 From 24ab2d4ef52c2dbb62a60844b87fc8872383407a Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Tue, 22 Mar 2022 14:38:39 -0700 Subject: ntfs: add sanity check on allocation size [ Upstream commit 714fbf2647b1a33d914edd695d4da92029c7e7c0 ] ntfs_read_inode_mount invokes ntfs_malloc_nofs with zero allocation size. It triggers one BUG in the __ntfs_malloc function. Fix this by adding sanity check on ni->attr_list_size. Link: https://lkml.kernel.org/r/20220120094914.47736-1-dzm91@hust.edu.cn Reported-by: syzbot+3c765c5248797356edaa@syzkaller.appspotmail.com Signed-off-by: Dongliang Mu Acked-by: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/ntfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 4474adb393ca..517b71c73aa9 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1881,6 +1881,10 @@ int ntfs_read_inode_mount(struct inode *vi) } /* Now allocate memory for the attribute list. */ ni->attr_list_size = (u32)ntfs_attr_size(a); + if (!ni->attr_list_size) { + ntfs_error(sb, "Attr_list_size is zero"); + goto put_err_out; + } ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); if (!ni->attr_list) { ntfs_error(sb, "Not enough memory to allocate buffer " -- cgit v1.2.3 From 2ad07009c459e56ebdcc089d850d664660fdb742 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:32 +0800 Subject: ubifs: rename_whiteout: Fix double free for whiteout_ui->data commit 40a8f0d5e7b3999f096570edab71c345da812e3e upstream. 'whiteout_ui->data' will be freed twice if space budget fail for rename whiteout operation as following process: rename_whiteout dev = kmalloc whiteout_ui->data = dev kfree(whiteout_ui->data) // Free first time iput(whiteout) ubifs_free_inode kfree(ui->data) // Double free! KASAN reports: ================================================================== BUG: KASAN: double-free or invalid-free in ubifs_free_inode+0x4f/0x70 Call Trace: kfree+0x117/0x490 ubifs_free_inode+0x4f/0x70 [ubifs] i_callback+0x30/0x60 rcu_do_batch+0x366/0xac0 __do_softirq+0x133/0x57f Allocated by task 1506: kmem_cache_alloc_trace+0x3c2/0x7a0 do_rename+0x9b7/0x1150 [ubifs] ubifs_rename+0x106/0x1f0 [ubifs] do_syscall_64+0x35/0x80 Freed by task 1506: kfree+0x117/0x490 do_rename.cold+0x53/0x8a [ubifs] ubifs_rename+0x106/0x1f0 [ubifs] do_syscall_64+0x35/0x80 The buggy address belongs to the object at ffff88810238bed8 which belongs to the cache kmalloc-8 of size 8 ================================================================== Let ubifs_free_inode() free 'whiteout_ui->data'. BTW, delete unused assignment 'whiteout_ui->data_len = 0', process 'ubifs_evict_inode() -> ubifs_jnl_delete_inode() -> ubifs_jnl_write_inode()' doesn't need it (because 'inc_nlink(whiteout)' won't be excuted by 'goto out_release', and the nlink of whiteout inode is 0). Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/dir.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 7c61d0ec0159..cfa8881d8cca 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1425,8 +1425,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, err = ubifs_budget_space(c, &wht_req); if (err) { - kfree(whiteout_ui->data); - whiteout_ui->data_len = 0; iput(whiteout); goto out_release; } -- cgit v1.2.3 From c58af8564a7b08757173009030b74baf4b2b762b Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:33 +0800 Subject: ubifs: Fix deadlock in concurrent rename whiteout and inode writeback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit afd427048047e8efdedab30e8888044e2be5aa9c upstream. Following hung tasks: [ 77.028764] task:kworker/u8:4 state:D stack: 0 pid: 132 [ 77.028820] Call Trace: [ 77.029027] schedule+0x8c/0x1b0 [ 77.029067] mutex_lock+0x50/0x60 [ 77.029074] ubifs_write_inode+0x68/0x1f0 [ubifs] [ 77.029117] __writeback_single_inode+0x43c/0x570 [ 77.029128] writeback_sb_inodes+0x259/0x740 [ 77.029148] wb_writeback+0x107/0x4d0 [ 77.029163] wb_workfn+0x162/0x7b0 [ 92.390442] task:aa state:D stack: 0 pid: 1506 [ 92.390448] Call Trace: [ 92.390458] schedule+0x8c/0x1b0 [ 92.390461] wb_wait_for_completion+0x82/0xd0 [ 92.390469] __writeback_inodes_sb_nr+0xb2/0x110 [ 92.390472] writeback_inodes_sb_nr+0x14/0x20 [ 92.390476] ubifs_budget_space+0x705/0xdd0 [ubifs] [ 92.390503] do_rename.cold+0x7f/0x187 [ubifs] [ 92.390549] ubifs_rename+0x8b/0x180 [ubifs] [ 92.390571] vfs_rename+0xdb2/0x1170 [ 92.390580] do_renameat2+0x554/0x770 , are caused by concurrent rename whiteout and inode writeback processes: rename_whiteout(Thread 1) wb_workfn(Thread2) ubifs_rename do_rename lock_4_inodes (Hold ui_mutex) ubifs_budget_space make_free_space shrink_liability __writeback_inodes_sb_nr bdi_split_work_to_wbs (Queue new wb work) wb_do_writeback(wb work) __writeback_single_inode ubifs_write_inode LOCK(ui_mutex) ↑ wb_wait_for_completion (Wait wb work) <-- deadlock! Reproducer (Detail program in [Link]): 1. SYS_renameat2("/mp/dir/file", "/mp/dir/whiteout", RENAME_WHITEOUT) 2. Consume out of space before kernel(mdelay) doing budget for whiteout Fix it by doing whiteout space budget before locking ubifs inodes. BTW, it also fixes wrong goto tag 'out_release' in whiteout budget error handling path(It should at least recover dir i_size and unlock 4 ubifs inodes). Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214733 Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/dir.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index cfa8881d8cca..2735ad1affed 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1324,6 +1324,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (flags & RENAME_WHITEOUT) { union ubifs_dev_desc *dev = NULL; + struct ubifs_budget_req wht_req; dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); if (!dev) { @@ -1345,6 +1346,20 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, whiteout_ui->data = dev; whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0)); ubifs_assert(c, !whiteout_ui->dirty); + + memset(&wht_req, 0, sizeof(struct ubifs_budget_req)); + wht_req.dirtied_ino = 1; + wht_req.dirtied_ino_d = ALIGN(whiteout_ui->data_len, 8); + /* + * To avoid deadlock between space budget (holds ui_mutex and + * waits wb work) and writeback work(waits ui_mutex), do space + * budget before ubifs inodes locked. + */ + err = ubifs_budget_space(c, &wht_req); + if (err) { + iput(whiteout); + goto out_release; + } } lock_4_inodes(old_dir, new_dir, new_inode, whiteout); @@ -1419,16 +1434,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, } if (whiteout) { - struct ubifs_budget_req wht_req = { .dirtied_ino = 1, - .dirtied_ino_d = \ - ALIGN(ubifs_inode(whiteout)->data_len, 8) }; - - err = ubifs_budget_space(c, &wht_req); - if (err) { - iput(whiteout); - goto out_release; - } - inc_nlink(whiteout); mark_inode_dirty(whiteout); -- cgit v1.2.3 From ff846f2c5d1de96f0af036eb9e3a7ecfeb1f1cd5 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:35 +0800 Subject: ubifs: Add missing iput if do_tmpfile() failed in rename whiteout commit 716b4573026bcbfa7b58ed19fe15554bac66b082 upstream. whiteout inode should be put when do_tmpfile() failed if inode has been initialized. Otherwise we will get following warning during umount: UBIFS error (ubi0:0 pid 1494): ubifs_assert_failed [ubifs]: UBIFS assert failed: c->bi.dd_growth == 0, in fs/ubifs/super.c:1930 VFS: Busy inodes after unmount of ubifs. Self-destruct in 5 seconds. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng Suggested-by: Sascha Hauer Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/dir.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 2735ad1affed..2cbc5f05f671 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -432,6 +432,8 @@ out_inode: make_bad_inode(inode); if (!instantiated) iput(inode); + else if (whiteout) + iput(*whiteout); out_budg: ubifs_release_budget(c, &req); if (!instantiated) -- cgit v1.2.3 From c67bc98d1f0853bb196e9c48eab38b6f2ddab795 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:36 +0800 Subject: ubifs: Rename whiteout atomically commit 278d9a243635f26c05ad95dcf9c5a593b9e04dc6 upstream. Currently, rename whiteout has 3 steps: 1. create tmpfile(which associates old dentry to tmpfile inode) for whiteout, and store tmpfile to disk 2. link whiteout, associate whiteout inode to old dentry agagin and store old dentry, old inode, new dentry on disk 3. writeback dirty whiteout inode to disk Suddenly power-cut or error occurring(eg. ENOSPC returned by budget, memory allocation failure) during above steps may cause kinds of problems: Problem 1: ENOSPC returned by whiteout space budget (before step 2), old dentry will disappear after rename syscall, whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, REANME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, whiteout inode with 'nlink=1' is not stored on disk, whiteout dentry(old dentry) is written on disk, whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on whiteout file). Now, we use following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Rely writing in-mem inode on disk by ubifs_jnl_rename() to finish rename whiteout, which avoids middle disk state caused by suddenly power-cut and error occurring. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/dir.c | 144 ++++++++++++++++++++++++++++++++++------------------- fs/ubifs/journal.c | 52 ++++++++++++++++--- 2 files changed, 136 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 2cbc5f05f671..deaf2d5dba5b 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -349,8 +349,56 @@ out_budg: return err; } -static int do_tmpfile(struct inode *dir, struct dentry *dentry, - umode_t mode, struct inode **whiteout) +static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) +{ + int err; + umode_t mode = S_IFCHR | WHITEOUT_MODE; + struct inode *inode; + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct fscrypt_name nm; + + /* + * Create an inode('nlink = 1') for whiteout without updating journal, + * let ubifs_jnl_rename() store it on flash to complete rename whiteout + * atomically. + */ + + dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dentry, mode, dir->i_ino); + + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + return ERR_PTR(err); + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_free; + } + + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); + ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations); + + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_inode; + + /* The dir size is updated by do_rename. */ + insert_inode_hash(inode); + + return inode; + +out_inode: + make_bad_inode(inode); + iput(inode); +out_free: + fscrypt_free_filename(&nm); + ubifs_err(c, "cannot create whiteout file, error %d", err); + return ERR_PTR(err); +} + +static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; @@ -392,25 +440,13 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, } ui = ubifs_inode(inode); - if (whiteout) { - init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); - ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations); - } - err = ubifs_init_security(dir, inode, &dentry->d_name); if (err) goto out_inode; mutex_lock(&ui->ui_mutex); insert_inode_hash(inode); - - if (whiteout) { - mark_inode_dirty(inode); - drop_nlink(inode); - *whiteout = inode; - } else { - d_tmpfile(dentry, inode); - } + d_tmpfile(dentry, inode); ubifs_assert(c, ui->dirty); instantiated = 1; @@ -432,8 +468,6 @@ out_inode: make_bad_inode(inode); if (!instantiated) iput(inode); - else if (whiteout) - iput(*whiteout); out_budg: ubifs_release_budget(c, &req); if (!instantiated) @@ -443,12 +477,6 @@ out_budg: return err; } -static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) -{ - return do_tmpfile(dir, dentry, mode, NULL); -} - /** * vfs_dent_type - get VFS directory entry type. * @type: UBIFS directory entry type @@ -1266,17 +1294,19 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, .dirtied_ino = 3 }; struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; + struct ubifs_budget_req wht_req; struct timespec64 time; unsigned int saved_nlink; struct fscrypt_name old_nm, new_nm; /* - * Budget request settings: deletion direntry, new direntry, removing - * the old inode, and changing old and new parent directory inodes. + * Budget request settings: + * req: deletion direntry, new direntry, removing the old inode, + * and changing old and new parent directory inodes. * - * However, this operation also marks the target inode as dirty and - * does not write it, so we allocate budget for the target inode - * separately. + * wht_req: new whiteout inode for RENAME_WHITEOUT. + * + * ino_req: marks the target inode as dirty and does not write it. */ dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu flags 0x%x", @@ -1326,7 +1356,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (flags & RENAME_WHITEOUT) { union ubifs_dev_desc *dev = NULL; - struct ubifs_budget_req wht_req; dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); if (!dev) { @@ -1334,24 +1363,26 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_release; } - err = do_tmpfile(old_dir, old_dentry, S_IFCHR | WHITEOUT_MODE, &whiteout); - if (err) { + /* + * The whiteout inode without dentry is pinned in memory, + * umount won't happen during rename process because we + * got parent dentry. + */ + whiteout = create_whiteout(old_dir, old_dentry); + if (IS_ERR(whiteout)) { + err = PTR_ERR(whiteout); kfree(dev); goto out_release; } - spin_lock(&whiteout->i_lock); - whiteout->i_state |= I_LINKABLE; - spin_unlock(&whiteout->i_lock); - whiteout_ui = ubifs_inode(whiteout); whiteout_ui->data = dev; whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0)); ubifs_assert(c, !whiteout_ui->dirty); memset(&wht_req, 0, sizeof(struct ubifs_budget_req)); - wht_req.dirtied_ino = 1; - wht_req.dirtied_ino_d = ALIGN(whiteout_ui->data_len, 8); + wht_req.new_ino = 1; + wht_req.new_ino_d = ALIGN(whiteout_ui->data_len, 8); /* * To avoid deadlock between space budget (holds ui_mutex and * waits wb work) and writeback work(waits ui_mutex), do space @@ -1359,6 +1390,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, */ err = ubifs_budget_space(c, &wht_req); if (err) { + /* + * Whiteout inode can not be written on flash by + * ubifs_jnl_write_inode(), because it's neither + * dirty nor zero-nlink. + */ iput(whiteout); goto out_release; } @@ -1433,17 +1469,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); if (unlink && IS_SYNC(new_inode)) sync = 1; - } - - if (whiteout) { - inc_nlink(whiteout); - mark_inode_dirty(whiteout); - - spin_lock(&whiteout->i_lock); - whiteout->i_state &= ~I_LINKABLE; - spin_unlock(&whiteout->i_lock); - - iput(whiteout); + /* + * S_SYNC flag of whiteout inherits from the old_dir, and we + * have already checked the old dir inode. So there is no need + * to check whiteout. + */ } err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir, @@ -1454,6 +1484,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); ubifs_release_budget(c, &req); + if (whiteout) { + ubifs_release_budget(c, &wht_req); + iput(whiteout); + } + mutex_lock(&old_inode_ui->ui_mutex); release = old_inode_ui->dirty; mark_inode_dirty_sync(old_inode); @@ -1462,11 +1497,16 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (release) ubifs_release_budget(c, &ino_req); if (IS_SYNC(old_inode)) - err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); + /* + * Rename finished here. Although old inode cannot be updated + * on flash, old ctime is not a big problem, don't return err + * code to userspace. + */ + old_inode->i_sb->s_op->write_inode(old_inode, NULL); fscrypt_free_filename(&old_nm); fscrypt_free_filename(&new_nm); - return err; + return 0; out_cancel: if (unlink) { @@ -1487,11 +1527,11 @@ out_cancel: inc_nlink(old_dir); } } + unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); if (whiteout) { - drop_nlink(whiteout); + ubifs_release_budget(c, &wht_req); iput(whiteout); } - unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); out_release: ubifs_release_budget(c, &ino_req); ubifs_release_budget(c, &req); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 8ea680dba61e..75dab0ae3939 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1207,9 +1207,9 @@ out_free: * @sync: non-zero if the write-buffer has to be synchronized * * This function implements the re-name operation which may involve writing up - * to 4 inodes and 2 directory entries. It marks the written inodes as clean - * and returns zero on success. In case of failure, a negative error code is - * returned. + * to 4 inodes(new inode, whiteout inode, old and new parent directory inodes) + * and 2 directory entries. It marks the written inodes as clean and returns + * zero on success. In case of failure, a negative error code is returned. */ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, const struct inode *old_inode, @@ -1222,14 +1222,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, void *p; union ubifs_key key; struct ubifs_dent_node *dent, *dent2; - int err, dlen1, dlen2, ilen, lnum, offs, len, orphan_added = 0; + int err, dlen1, dlen2, ilen, wlen, lnum, offs, len, orphan_added = 0; int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; int last_reference = !!(new_inode && new_inode->i_nlink == 0); int move = (old_dir != new_dir); - struct ubifs_inode *new_ui; + struct ubifs_inode *new_ui, *whiteout_ui; u8 hash_old_dir[UBIFS_HASH_ARR_SZ]; u8 hash_new_dir[UBIFS_HASH_ARR_SZ]; u8 hash_new_inode[UBIFS_HASH_ARR_SZ]; + u8 hash_whiteout_inode[UBIFS_HASH_ARR_SZ]; u8 hash_dent1[UBIFS_HASH_ARR_SZ]; u8 hash_dent2[UBIFS_HASH_ARR_SZ]; @@ -1249,9 +1250,20 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, } else ilen = 0; + if (whiteout) { + whiteout_ui = ubifs_inode(whiteout); + ubifs_assert(c, mutex_is_locked(&whiteout_ui->ui_mutex)); + ubifs_assert(c, whiteout->i_nlink == 1); + ubifs_assert(c, !whiteout_ui->dirty); + wlen = UBIFS_INO_NODE_SZ; + wlen += whiteout_ui->data_len; + } else + wlen = 0; + aligned_dlen1 = ALIGN(dlen1, 8); aligned_dlen2 = ALIGN(dlen2, 8); - len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8); + len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + + ALIGN(wlen, 8) + ALIGN(plen, 8); if (move) len += plen; @@ -1313,6 +1325,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, p += ALIGN(ilen, 8); } + if (whiteout) { + pack_inode(c, p, whiteout, 0); + err = ubifs_node_calc_hash(c, p, hash_whiteout_inode); + if (err) + goto out_release; + + p += ALIGN(wlen, 8); + } + if (!move) { pack_inode(c, p, old_dir, 1); err = ubifs_node_calc_hash(c, p, hash_old_dir); @@ -1352,6 +1373,9 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, if (new_inode) ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, new_inode->i_ino); + if (whiteout) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, + whiteout->i_ino); } release_head(c, BASEHD); @@ -1368,8 +1392,6 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, hash_dent2, old_nm); if (err) goto out_ro; - - ubifs_delete_orphan(c, whiteout->i_ino); } else { err = ubifs_add_dirt(c, lnum, dlen2); if (err) @@ -1390,6 +1412,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, offs += ALIGN(ilen, 8); } + if (whiteout) { + ino_key_init(c, &key, whiteout->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, wlen, + hash_whiteout_inode); + if (err) + goto out_ro; + offs += ALIGN(wlen, 8); + } + ino_key_init(c, &key, old_dir->i_ino); err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_old_dir); if (err) @@ -1410,6 +1441,11 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, new_ui->synced_i_size = new_ui->ui_size; spin_unlock(&new_ui->ui_lock); } + /* + * No need to mark whiteout inode clean. + * Whiteout doesn't have non-zero size, no need to update + * synced_i_size for whiteout_ui. + */ mark_inode_clean(c, ubifs_inode(old_dir)); if (move) mark_inode_clean(c, ubifs_inode(new_dir)); -- cgit v1.2.3 From a9662bec5a4dfc87b1503e26aec1aadfdf521658 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:37 +0800 Subject: ubifs: Fix 'ui->dirty' race between do_tmpfile() and writeback work commit 60eb3b9c9f11206996f57cb89521824304b305ad upstream. 'ui->dirty' is not protected by 'ui_mutex' in function do_tmpfile() which may race with ubifs_write_inode[wb_workfn] to access/update 'ui->dirty', finally dirty space is released twice. open(O_TMPFILE) wb_workfn do_tmpfile ubifs_budget_space(ino_req = { .dirtied_ino = 1}) d_tmpfile // mark inode(tmpfile) dirty ubifs_jnl_update // without holding tmpfile's ui_mutex mark_inode_clean(ui) if (ui->dirty) ubifs_release_dirty_inode_budget(ui) // release first time ubifs_write_inode mutex_lock(&ui->ui_mutex) ubifs_release_dirty_inode_budget(ui) // release second time mutex_unlock(&ui->ui_mutex) ui->dirty = 0 Run generic/476 can reproduce following message easily (See reproducer in [Link]): UBIFS error (ubi0:0 pid 2578): ubifs_assert_failed [ubifs]: UBIFS assert failed: c->bi.dd_growth >= 0, in fs/ubifs/budget.c:554 UBIFS warning (ubi0:0 pid 2578): ubifs_ro_mode [ubifs]: switched to read-only mode, error -22 Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call Trace: ubifs_ro_mode+0x54/0x60 [ubifs] ubifs_assert_failed+0x4b/0x80 [ubifs] ubifs_release_budget+0x468/0x5a0 [ubifs] ubifs_release_dirty_inode_budget+0x53/0x80 [ubifs] ubifs_write_inode+0x121/0x1f0 [ubifs] ... wb_workfn+0x283/0x7b0 Fix it by holding tmpfile ubifs inode lock during ubifs_jnl_update(). Similar problem exists in whiteout renaming, but previous fix("ubifs: Rename whiteout atomically") has solved the problem. Fixes: 474b93704f32163 ("ubifs: Implement O_TMPFILE") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214765 Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/dir.c | 60 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index deaf2d5dba5b..ebdc9aa04cbb 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -397,6 +397,32 @@ out_free: return ERR_PTR(err); } +/** + * lock_2_inodes - a wrapper for locking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + * + * We do not implement any tricks to guarantee strict lock ordering, because + * VFS has already done it for us on the @i_mutex. So this is just a simple + * wrapper function. + */ +static void lock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); +} + +/** + * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + */ +static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); +} + static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode) { @@ -404,7 +430,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, struct ubifs_info *c = dir->i_sb->s_fs_info; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1}; struct ubifs_budget_req ino_req = { .dirtied_ino = 1 }; - struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir); + struct ubifs_inode *ui; int err, instantiated = 0; struct fscrypt_name nm; @@ -452,18 +478,18 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, instantiated = 1; mutex_unlock(&ui->ui_mutex); - mutex_lock(&dir_ui->ui_mutex); + lock_2_inodes(dir, inode); err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; - mutex_unlock(&dir_ui->ui_mutex); + unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); return 0; out_cancel: - mutex_unlock(&dir_ui->ui_mutex); + unlock_2_inodes(dir, inode); out_inode: make_bad_inode(inode); if (!instantiated) @@ -690,32 +716,6 @@ static int ubifs_dir_release(struct inode *dir, struct file *file) return 0; } -/** - * lock_2_inodes - a wrapper for locking two UBIFS inodes. - * @inode1: first inode - * @inode2: second inode - * - * We do not implement any tricks to guarantee strict lock ordering, because - * VFS has already done it for us on the @i_mutex. So this is just a simple - * wrapper function. - */ -static void lock_2_inodes(struct inode *inode1, struct inode *inode2) -{ - mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); - mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); -} - -/** - * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. - * @inode1: first inode - * @inode2: second inode - */ -static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) -{ - mutex_unlock(&ubifs_inode(inode2)->ui_mutex); - mutex_unlock(&ubifs_inode(inode1)->ui_mutex); -} - static int ubifs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { -- cgit v1.2.3 From 489c3a2577b3f1bf7e41279d12a44c71e8527427 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:38 +0800 Subject: ubifs: Rectify space amount budget for mkdir/tmpfile operations commit a6dab6607d4681d227905d5198710b575dbdb519 upstream. UBIFS should make sure the flash has enough space to store dirty (Data that is newer than disk) data (in memory), space budget is exactly designed to do that. If space budget calculates less data than we need, 'make_reservation()' will do more work(return -ENOSPC if no free space lelf, sometimes we can see "cannot reserve xxx bytes in jhead xxx, error -28" in ubifs error messages) with ubifs inodes locked, which may effect other syscalls. A simple way to decide how much space do we need when make a budget: See how much space is needed by 'make_reservation()' in ubifs_jnl_xxx() function according to corresponding operation. It's better to report ENOSPC in ubifs_budget_space(), as early as we can. Fixes: 474b93704f32163 ("ubifs: Implement O_TMPFILE") Fixes: 1e51764a3c2ac05 ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/dir.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ebdc9aa04cbb..7ae48f273feb 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -428,15 +428,18 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1}; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1}; struct ubifs_budget_req ino_req = { .dirtied_ino = 1 }; struct ubifs_inode *ui; int err, instantiated = 0; struct fscrypt_name nm; /* - * Budget request settings: new dirty inode, new direntry, - * budget for dirtied inode will be released via writeback. + * Budget request settings: new inode, new direntry, changing the + * parent directory inode. + * Allocate budget separately for new dirtied inode, the budget will + * be released via writeback. */ dbg_gen("dent '%pd', mode %#hx in dir ino %lu", @@ -979,7 +982,8 @@ static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; int err, sz_change; - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1}; struct fscrypt_name nm; /* -- cgit v1.2.3 From 56cf8b26b18ecc7768a9918a98e4d7b279253675 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:39 +0800 Subject: ubifs: setflags: Make dirtied_ino_d 8 bytes aligned commit 1b83ec057db16b4d0697dc21ef7a9743b6041f72 upstream. Make 'ui->data_len' aligned with 8 bytes before it is assigned to dirtied_ino_d. Since 8871d84c8f8b0c6b("ubifs: convert to fileattr") applied, 'setflags()' only affects regular files and directories, only xattr inode, symlink inode and special inode(pipe/char_dev/block_dev) have none- zero 'ui->data_len' field, so assertion '!(req->dirtied_ino_d & 7)' cannot fail in ubifs_budget_space(). To avoid assertion fails in future evolution(eg. setflags can operate special inodes), it's better to make dirtied_ino_d 8 bytes aligned, after all aligned size is still zero for regular files. Fixes: 1e51764a3c2ac05a ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index c6a863487780..71bcebe45f9c 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -108,7 +108,7 @@ static int setflags(struct inode *inode, int flags) struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_budget_req req = { .dirtied_ino = 1, - .dirtied_ino_d = ui->data_len }; + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; err = ubifs_budget_space(c, &req); if (err) -- cgit v1.2.3 From a7054aaf1909cf40489c0ec1b728fdcf79c751a6 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:40 +0800 Subject: ubifs: Fix read out-of-bounds in ubifs_wbuf_write_nolock() commit 4f2262a334641e05f645364d5ade1f565c85f20b upstream. Function ubifs_wbuf_write_nolock() may access buf out of bounds in following process: ubifs_wbuf_write_nolock(): aligned_len = ALIGN(len, 8); // Assume len = 4089, aligned_len = 4096 if (aligned_len <= wbuf->avail) ... // Not satisfy if (wbuf->used) { ubifs_leb_write() // Fill some data in avail wbuf len -= wbuf->avail; // len is still not 8-bytes aligned aligned_len -= wbuf->avail; } n = aligned_len >> c->max_write_shift; if (n) { n <<= c->max_write_shift; err = ubifs_leb_write(c, wbuf->lnum, buf + written, wbuf->offs, n); // n > len, read out of bounds less than 8(n-len) bytes } , which can be catched by KASAN: ========================================================= BUG: KASAN: slab-out-of-bounds in ecc_sw_hamming_calculate+0x1dc/0x7d0 Read of size 4 at addr ffff888105594ff8 by task kworker/u8:4/128 Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call Trace: kasan_report.cold+0x81/0x165 nand_write_page_swecc+0xa9/0x160 ubifs_leb_write+0xf2/0x1b0 [ubifs] ubifs_wbuf_write_nolock+0x421/0x12c0 [ubifs] write_head+0xdc/0x1c0 [ubifs] ubifs_jnl_write_inode+0x627/0x960 [ubifs] wb_workfn+0x8af/0xb80 Function ubifs_wbuf_write_nolock() accepts that parameter 'len' is not 8 bytes aligned, the 'len' represents the true length of buf (which is allocated in 'ubifs_jnl_xxx', eg. ubifs_jnl_write_inode), so ubifs_wbuf_write_nolock() must handle the length read from 'buf' carefully to write leb safely. Fetch a reproducer in [Link]. Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214785 Reported-by: Chengsong Ke Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/io.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 00b61dba62b7..b019dd6f7fa0 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -833,16 +833,42 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) */ n = aligned_len >> c->max_write_shift; if (n) { - n <<= c->max_write_shift; + int m = n - 1; + dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, wbuf->offs); - err = ubifs_leb_write(c, wbuf->lnum, buf + written, - wbuf->offs, n); + + if (m) { + /* '(n-1)<max_write_shift < len' is always true. */ + m <<= c->max_write_shift; + err = ubifs_leb_write(c, wbuf->lnum, buf + written, + wbuf->offs, m); + if (err) + goto out; + wbuf->offs += m; + aligned_len -= m; + len -= m; + written += m; + } + + /* + * The non-written len of buf may be less than 'n' because + * parameter 'len' is not 8 bytes aligned, so here we read + * min(len, n) bytes from buf. + */ + n = 1 << c->max_write_shift; + memcpy(wbuf->buf, buf + written, min(len, n)); + if (n > len) { + ubifs_assert(c, n - len < 8); + ubifs_pad(c, wbuf->buf + len, n - len); + } + + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, n); if (err) goto out; wbuf->offs += n; aligned_len -= n; - len -= n; + len -= min(len, n); written += n; } -- cgit v1.2.3 From 4f75bab98565afd4f905059c56ec4caba88a7eec Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:41 +0800 Subject: ubifs: Fix to add refcount once page is set private commit 3b67db8a6ca83e6ff90b756d3da0c966f61cd37b upstream. MM defined the rule [1] very clearly that once page was set with PG_private flag, we should increment the refcount in that page, also main flows like pageout(), migrate_page() will assume there is one additional page reference count if page_has_private() returns true. Otherwise, we may get a BUG in page migration: page:0000000080d05b9d refcount:-1 mapcount:0 mapping:000000005f4d82a8 index:0xe2 pfn:0x14c12 aops:ubifs_file_address_operations [ubifs] ino:8f1 dentry name:"f30e" flags: 0x1fffff80002405(locked|uptodate|owner_priv_1|private|node=0| zone=1|lastcpupid=0x1fffff) page dumped because: VM_BUG_ON_PAGE(page_count(page) != 0) ------------[ cut here ]------------ kernel BUG at include/linux/page_ref.h:184! invalid opcode: 0000 [#1] SMP CPU: 3 PID: 38 Comm: kcompactd0 Not tainted 5.15.0-rc5 RIP: 0010:migrate_page_move_mapping+0xac3/0xe70 Call Trace: ubifs_migrate_page+0x22/0xc0 [ubifs] move_to_new_page+0xb4/0x600 migrate_pages+0x1523/0x1cc0 compact_zone+0x8c5/0x14b0 kcompactd+0x2bc/0x560 kthread+0x18c/0x1e0 ret_from_fork+0x1f/0x30 Before the time, we should make clean a concept, what does refcount means in page gotten from grab_cache_page_write_begin(). There are 2 situations: Situation 1: refcount is 3, page is created by __page_cache_alloc. TYPE_A - the write process is using this page TYPE_B - page is assigned to one certain mapping by calling __add_to_page_cache_locked() TYPE_C - page is added into pagevec list corresponding current cpu by calling lru_cache_add() Situation 2: refcount is 2, page is gotten from the mapping's tree TYPE_B - page has been assigned to one certain mapping TYPE_A - the write process is using this page (by calling page_cache_get_speculative()) Filesystem releases one refcount by calling put_page() in xxx_write_end(), the released refcount corresponds to TYPE_A (write task is using it). If there are any processes using a page, page migration process will skip the page by judging whether expected_page_refs() equals to page refcount. The BUG is caused by following process: PA(cpu 0) kcompactd(cpu 1) compact_zone ubifs_write_begin page_a = grab_cache_page_write_begin add_to_page_cache_lru lru_cache_add pagevec_add // put page into cpu 0's pagevec (refcnf = 3, for page creation process) ubifs_write_end SetPagePrivate(page_a) // doesn't increase page count ! unlock_page(page_a) put_page(page_a) // refcnt = 2 [...] PB(cpu 0) filemap_read filemap_get_pages add_to_page_cache_lru lru_cache_add __pagevec_lru_add // traverse all pages in cpu 0's pagevec __pagevec_lru_add_fn SetPageLRU(page_a) isolate_migratepages isolate_migratepages_block get_page_unless_zero(page_a) // refcnt = 3 list_add(page_a, from_list) migrate_pages(from_list) __unmap_and_move move_to_new_page ubifs_migrate_page(page_a) migrate_page_move_mapping expected_page_refs get 3 (migration[1] + mapping[1] + private[1]) release_pages put_page_testzero(page_a) // refcnt = 3 page_ref_freeze // refcnt = 0 page_ref_dec_and_test(0 - 1 = -1) page_ref_unfreeze VM_BUG_ON_PAGE(-1 != 0, page) UBIFS doesn't increase the page refcount after setting private flag, which leads to page migration task believes the page is not used by any other processes, so the page is migrated. This causes concurrent accessing on page refcount between put_page() called by other process(eg. read process calls lru_cache_add) and page_ref_unfreeze() called by migration task. Actually zhangjun has tried to fix this problem [2] by recalculating page refcnt in ubifs_migrate_page(). It's better to follow MM rules [1], because just like Kirill suggested in [2], we need to check all users of page_has_private() helper. Like f2fs does in [3], fix it by adding/deleting refcount when setting/clearing private for a page. BTW, according to [4], we set 'page->private' as 1 because ubifs just simply SetPagePrivate(). And, [5] provided a common helper to set/clear page private, ubifs can use this helper following the example of iomap, afs, btrfs, etc. Jump [6] to find a reproducer. [1] https://lore.kernel.org/lkml/2b19b3c4-2bc4-15fa-15cc-27a13e5c7af1@aol.com [2] https://www.spinics.net/lists/linux-mtd/msg04018.html [3] http://lkml.iu.edu/hypermail/linux/kernel/1903.0/03313.html [4] https://lore.kernel.org/linux-f2fs-devel/20210422154705.GO3596236@casper.infradead.org [5] https://lore.kernel.org/all/20200517214718.468-1-guoqing.jiang@cloud.ionos.com [6] https://bugzilla.kernel.org/show_bug.cgi?id=214961 Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/file.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5cfa28cd00cd..6b45a037a047 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -570,7 +570,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, } if (!PagePrivate(page)) { - SetPagePrivate(page); + attach_page_private(page, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); __set_page_dirty_nobuffers(page); } @@ -947,7 +947,7 @@ static int do_writepage(struct page *page, int len) release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); kunmap(page); @@ -1304,7 +1304,7 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset, release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); } @@ -1471,8 +1471,8 @@ static int ubifs_migrate_page(struct address_space *mapping, return rc; if (PagePrivate(page)) { - ClearPagePrivate(page); - SetPagePrivate(newpage); + detach_page_private(page); + attach_page_private(newpage, (void *)1); } if (mode != MIGRATE_SYNC_NO_COPY) @@ -1496,7 +1496,7 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) return 0; ubifs_assert(c, PagePrivate(page)); ubifs_assert(c, 0); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); return 1; } @@ -1567,7 +1567,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) else { if (!PageChecked(page)) ubifs_convert_page_budget(c); - SetPagePrivate(page); + attach_page_private(page, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); __set_page_dirty_nobuffers(page); } -- cgit v1.2.3 From 578bf41d9443bc662bc0d15a479025271d20983c Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 15 Feb 2022 12:07:36 +0800 Subject: ubifs: rename_whiteout: correct old_dir size computing commit 705757274599e2e064dd3054aabc74e8af31a095 upstream. When renaming the whiteout file, the old whiteout file is not deleted. Therefore, we add the old dentry size to the old dir like XFS. Otherwise, an error may be reported due to `fscki->calc_sz != fscki->size` in check_indes. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Reported-by: Zhihao Cheng Signed-off-by: Baokun Li Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/dir.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 7ae48f273feb..79e371bc15e1 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1402,6 +1402,9 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, iput(whiteout); goto out_release; } + + /* Add the old_dentry size to the old_dir size. */ + old_sz -= CALC_DENT_SIZE(fname_len(&old_nm)); } lock_4_inodes(old_dir, new_dir, new_inode, whiteout); -- cgit v1.2.3 From 1d8195349742e62ca45b01a0030d9c3b4f3dc685 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 10 Dec 2021 14:43:36 +0100 Subject: gfs2: gfs2_setattr_size error path fix commit 7336905a89f19173bf9301cd50a24421162f417c upstream. When gfs2_setattr_size() fails, it calls gfs2_rs_delete(ip, NULL) to get rid of any reservations the inode may have. Instead, it should pass in the inode's write count as the second parameter to allow gfs2_rs_delete() to figure out if the inode has any writers left. In a next step, there are two instances of gfs2_rs_delete(ip, NULL) left where we know that there can be no other users of the inode. Replace those with gfs2_rs_deltree(&ip->i_res) to avoid the unnecessary write count check. With that, gfs2_rs_delete() is only called with the inode's actual write count, so get rid of the second parameter. Fixes: a097dc7e24cb ("GFS2: Make rgrp reservations part of the gfs2_inode structure") Signed-off-by: Andreas Gruenbacher Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/bmap.c | 2 +- fs/gfs2/file.c | 2 +- fs/gfs2/inode.c | 2 +- fs/gfs2/rgrp.c | 7 ++++--- fs/gfs2/rgrp.h | 2 +- fs/gfs2/super.c | 2 +- 6 files changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index fba32141a651..bb9014ced702 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -2204,7 +2204,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) ret = do_shrink(inode, newsize); out: - gfs2_rs_delete(ip, NULL); + gfs2_rs_delete(ip); gfs2_qa_put(ip); return ret; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index f6d3b3e13d72..1c8b747072cb 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -713,7 +713,7 @@ static int gfs2_release(struct inode *inode, struct file *file) if (file->f_mode & FMODE_WRITE) { if (gfs2_rs_active(&ip->i_res)) - gfs2_rs_delete(ip, &inode->i_writecount); + gfs2_rs_delete(ip); gfs2_qa_put(ip); } return 0; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 3130f85d2b3f..97ee17843b4d 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -811,7 +811,7 @@ fail_free_inode: if (free_vfs_inode) /* else evict will do the put for us */ gfs2_glock_put(ip->i_gl); } - gfs2_rs_delete(ip, NULL); + gfs2_rs_deltree(&ip->i_res); gfs2_qa_put(ip); fail_free_acls: posix_acl_release(default_acl); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index c3b00ba92ed2..b8e363610402 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -680,13 +680,14 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) /** * gfs2_rs_delete - delete a multi-block reservation * @ip: The inode for this reservation - * @wcount: The inode's write count, or NULL * */ -void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount) +void gfs2_rs_delete(struct gfs2_inode *ip) { + struct inode *inode = &ip->i_inode; + down_write(&ip->i_rw_mutex); - if ((wcount == NULL) || (atomic_read(wcount) <= 1)) + if (atomic_read(&inode->i_writecount) <= 1) gfs2_rs_deltree(&ip->i_res); up_write(&ip->i_rw_mutex); } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index a6855fd796e0..2f80f3bbf876 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -45,7 +45,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, bool dinode, u64 *generation); extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); -extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount); +extern void gfs2_rs_delete(struct gfs2_inode *ip); extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index cc51b5f5f52d..0f2e0530dd43 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1398,7 +1398,7 @@ out: truncate_inode_pages_final(&inode->i_data); if (ip->i_qadata) gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0); - gfs2_rs_delete(ip, NULL); + gfs2_rs_deltree(&ip->i_res); gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); -- cgit v1.2.3 From 5c3c9bce1c99bf58efdf1c7af870240777ca64b5 Mon Sep 17 00:00:00 2001 From: Andrew Price Date: Tue, 22 Mar 2022 19:05:51 +0000 Subject: gfs2: Make sure FITRIM minlen is rounded up to fs block size commit 27ca8273fda398638ca994a207323a85b6d81190 upstream. Per fstrim(8) we must round up the minlen argument to the fs block size. The current calculation doesn't take into account devices that have a discard granularity and requested minlen less than 1 fs block, so the value can get shifted away to zero in the translation to fs blocks. The zero minlen passed to gfs2_rgrp_send_discards() then allows sb_issue_discard() to be called with nr_sects == 0 which returns -EINVAL and results in gfs2_rgrp_send_discards() returning -EIO. Make sure minlen is never < 1 fs block by taking the max of the requested minlen and the fs block size before comparing to the device's discard granularity and shifting to fs blocks. Fixes: 076f0faa764ab ("GFS2: Fix FITRIM argument handling") Signed-off-by: Andrew Price Signed-off-by: Andreas Gruenbacher Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/rgrp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index b8e363610402..403cf6f1eb8c 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1428,7 +1428,8 @@ int gfs2_fitrim(struct file *filp, void __user *argp) start = r.start >> bs_shift; end = start + (r.len >> bs_shift); - minlen = max_t(u64, r.minlen, + minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize); + minlen = max_t(u64, minlen, q->limits.discard_granularity) >> bs_shift; if (end <= start || minlen > sdp->sd_max_rg_data) -- cgit v1.2.3 From 0853bd6885c2f293d88aaa7f7f1702c959b31680 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 25 Mar 2022 16:36:31 +0000 Subject: io_uring: fix memory leak of uid in files registration commit c86d18f4aa93e0e66cda0e55827cd03eea6bc5f8 upstream. When there are no files for __io_sqe_files_scm() to process in the range, it'll free everything and return. However, it forgets to put uid. Fixes: 08a451739a9b5 ("io_uring: allow sparse fixed file sets") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/accee442376f33ce8aaebb099d04967533efde92.1648226048.git.asml.silence@gmail.com Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index ec0b50940405..5fc3a62eae72 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8130,6 +8130,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) fput(fpl->fp[i]); } else { kfree_skb(skb); + free_uid(fpl->user); kfree(fpl); } -- cgit v1.2.3 From 5b422da35c15a3835be731c98c6dbfb9c5072d41 Mon Sep 17 00:00:00 2001 From: Lv Ruyi Date: Tue, 29 Mar 2022 10:40:04 +0000 Subject: proc: bootconfig: Add null pointer check commit bed5b60bf67ccd8957b8c0558fead30c4a3f5d3f upstream. kzalloc is a memory allocation function which can return NULL when some internal memory errors happen. It is safer to add null pointer check. Link: https://lkml.kernel.org/r/20220329104004.2376879-1-lv.ruyi@zte.com.cn Cc: stable@vger.kernel.org Fixes: c1a3c36017d4 ("proc: bootconfig: Add /proc/bootconfig to show boot config list") Acked-by: Masami Hiramatsu Reported-by: Zeal Robot Signed-off-by: Lv Ruyi Signed-off-by: Steven Rostedt (Google) Signed-off-by: Greg Kroah-Hartman --- fs/proc/bootconfig.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index 6d8d4bf20837..2e244ada1f97 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -32,6 +32,8 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) int ret = 0; key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL); + if (!key) + return -ENOMEM; xbc_for_each_key_value(leaf, val) { ret = xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX); -- cgit v1.2.3 From f6ca862806df3762170cd3251852330304e781c9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 8 Mar 2022 12:55:29 -0600 Subject: coredump: Snapshot the vmas in do_coredump commit 95c5436a4883841588dae86fb0b9325f47ba5ad3 upstream. Move the call of dump_vma_snapshot and kvfree(vma_meta) out of the individual coredump routines into do_coredump itself. This makes the code less error prone and easier to maintain. Make the vma snapshot available to the coredump routines in struct coredump_params. This makes it easier to change and update what is captures in the vma snapshot and will be needed for fixing fill_file_notes. Reviewed-by: Jann Horn Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- fs/binfmt_elf.c | 20 +++++++------------- fs/binfmt_elf_fdpic.c | 18 ++++++------------ fs/coredump.c | 41 +++++++++++++++++++++++------------------ include/linux/binfmts.h | 3 +++ include/linux/coredump.h | 3 --- 5 files changed, 39 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3f6a7cac68fd..bd6189f55ddb 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2168,8 +2168,7 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int vma_count, segs, i; - size_t vma_data_size; + int segs, i; struct elfhdr elf; loff_t offset = 0, dataoff; struct elf_note_info info = { }; @@ -2177,16 +2176,12 @@ static int elf_core_dump(struct coredump_params *cprm) struct elf_shdr *shdr4extnum = NULL; Elf_Half e_phnum; elf_addr_t e_shoff; - struct core_vma_metadata *vma_meta; - - if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) - return 0; /* * The number of segs are recored into ELF header as 16bit value. * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. */ - segs = vma_count + elf_core_extra_phdrs(); + segs = cprm->vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -2225,7 +2220,7 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += vma_data_size; + offset += cprm->vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -2245,8 +2240,8 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Write program headers for segments dump */ - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; struct elf_phdr phdr; phdr.p_type = PT_LOAD; @@ -2283,8 +2278,8 @@ static int elf_core_dump(struct coredump_params *cprm) /* Align to page */ dump_skip_to(cprm, dataoff); - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; if (!dump_user_range(cprm, meta->start, meta->dump_size)) goto end_coredump; @@ -2301,7 +2296,6 @@ static int elf_core_dump(struct coredump_params *cprm) end_coredump: free_note_info(&info); kfree(shdr4extnum); - kvfree(vma_meta); kfree(phdr4note); return has_dumped; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 6d8fd6030cbb..830a6a876ffe 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1465,7 +1465,7 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm, static int elf_fdpic_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int vma_count, segs; + int segs; int i; struct elfhdr *elf = NULL; loff_t offset = 0, dataoff; @@ -1480,8 +1480,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) elf_addr_t e_shoff; struct core_thread *ct; struct elf_thread_status *tmp; - struct core_vma_metadata *vma_meta = NULL; - size_t vma_data_size; /* alloc memory for large data structures: too large to be on stack */ elf = kmalloc(sizeof(*elf), GFP_KERNEL); @@ -1491,9 +1489,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!psinfo) goto end_coredump; - if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) - goto end_coredump; - for (ct = current->mm->core_state->dumper.next; ct; ct = ct->next) { tmp = elf_dump_thread_status(cprm->siginfo->si_signo, @@ -1513,7 +1508,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) tmp->next = thread_list; thread_list = tmp; - segs = vma_count + elf_core_extra_phdrs(); + segs = cprm->vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -1558,7 +1553,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) /* Page-align dumped data */ dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += vma_data_size; + offset += cprm->vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -1578,8 +1573,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; /* write program headers for segments dump */ - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; struct elf_phdr phdr; size_t sz; @@ -1628,7 +1623,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) dump_skip_to(cprm, dataoff); - if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count)) + if (!elf_fdpic_dump_segments(cprm, cprm->vma_meta, cprm->vma_count)) goto end_coredump; if (!elf_core_write_extra_data(cprm)) @@ -1652,7 +1647,6 @@ end_coredump: thread_list = thread_list->next; kfree(tmp); } - kvfree(vma_meta); kfree(phdr4note); kfree(elf); kfree(psinfo); diff --git a/fs/coredump.c b/fs/coredump.c index bb66a4e3819f..64e2ce897420 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -53,6 +53,8 @@ #include +static bool dump_vma_snapshot(struct coredump_params *cprm); + int core_uses_pid; unsigned int core_pipe_limit; char core_pattern[CORENAME_MAX_SIZE] = "core"; @@ -601,6 +603,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) * by any locks. */ .mm_flags = mm->flags, + .vma_meta = NULL, }; audit_core_dumps(siginfo->si_signo); @@ -815,6 +818,9 @@ void do_coredump(const kernel_siginfo_t *siginfo) pr_info("Core dump to |%s disabled\n", cn.corename); goto close_fail; } + if (!dump_vma_snapshot(&cprm)) + goto close_fail; + file_start_write(cprm.file); core_dumped = binfmt->core_dump(&cprm); /* @@ -828,6 +834,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) dump_emit(&cprm, "", 1); } file_end_write(cprm.file); + kvfree(cprm.vma_meta); } if (ispipe && core_pipe_limit) wait_for_dump_helpers(cprm.file); @@ -1108,14 +1115,11 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, * Under the mmap_lock, take a snapshot of relevant information about the task's * VMAs. */ -int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, - struct core_vma_metadata **vma_meta, - size_t *vma_data_size_ptr) +static bool dump_vma_snapshot(struct coredump_params *cprm) { struct vm_area_struct *vma, *gate_vma; struct mm_struct *mm = current->mm; int i; - size_t vma_data_size = 0; /* * Once the stack expansion code is fixed to not change VMA bounds @@ -1123,20 +1127,21 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, * mmap_lock in read mode. */ if (mmap_write_lock_killable(mm)) - return -EINTR; + return false; + cprm->vma_data_size = 0; gate_vma = get_gate_vma(mm); - *vma_count = mm->map_count + (gate_vma ? 1 : 0); + cprm->vma_count = mm->map_count + (gate_vma ? 1 : 0); - *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL); - if (!*vma_meta) { + cprm->vma_meta = kvmalloc_array(cprm->vma_count, sizeof(*cprm->vma_meta), GFP_KERNEL); + if (!cprm->vma_meta) { mmap_write_unlock(mm); - return -ENOMEM; + return false; } for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; vma = next_vma(vma, gate_vma), i++) { - struct core_vma_metadata *m = (*vma_meta) + i; + struct core_vma_metadata *m = cprm->vma_meta + i; m->start = vma->vm_start; m->end = vma->vm_end; @@ -1146,13 +1151,14 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, mmap_write_unlock(mm); - if (WARN_ON(i != *vma_count)) { - kvfree(*vma_meta); - return -EFAULT; + if (WARN_ON(i != cprm->vma_count)) { + kvfree(cprm->vma_meta); + return false; } - for (i = 0; i < *vma_count; i++) { - struct core_vma_metadata *m = (*vma_meta) + i; + + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *m = cprm->vma_meta + i; if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) { char elfmag[SELFMAG]; @@ -1165,9 +1171,8 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, } } - vma_data_size += m->dump_size; + cprm->vma_data_size += m->dump_size; } - *vma_data_size_ptr = vma_data_size; - return 0; + return true; } diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 049cf9421d83..f821b7243361 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -87,6 +87,9 @@ struct coredump_params { loff_t written; loff_t pos; loff_t to_skip; + int vma_count; + size_t vma_data_size; + struct core_vma_metadata *vma_meta; }; /* diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 78fcd776b185..cc1aee2cb46a 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -29,9 +29,6 @@ extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len); -int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, - struct core_vma_metadata **vma_meta, - size_t *vma_data_size_ptr); extern void do_coredump(const kernel_siginfo_t *siginfo); #else static inline void do_coredump(const kernel_siginfo_t *siginfo) {} -- cgit v1.2.3 From 7ba958df64493aa8fb3af315db5095711a0b3589 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 8 Mar 2022 13:01:19 -0600 Subject: coredump: Remove the WARN_ON in dump_vma_snapshot commit 49c1866348f364478a0c4d3dd13fd08bb82d3a5b upstream. The condition is impossible and to the best of my knowledge has never triggered. We are in deep trouble if that conditions happens and we walk past the end of our allocated array. So delete the WARN_ON and the code that makes it look like the kernel can handle the case of walking past the end of it's vma_meta array. Reviewed-by: Jann Horn Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- fs/coredump.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 64e2ce897420..e95fea3beaeb 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1151,12 +1151,6 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) mmap_write_unlock(mm); - if (WARN_ON(i != cprm->vma_count)) { - kvfree(cprm->vma_meta); - return false; - } - - for (i = 0; i < cprm->vma_count; i++) { struct core_vma_metadata *m = cprm->vma_meta + i; -- cgit v1.2.3 From cabd69640957f029b476bc2d74ebf24ac67da6bb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 31 Jan 2022 12:17:38 -0600 Subject: coredump/elf: Pass coredump_params into fill_note_info commit 9ec7d3230717b4fe9b6c7afeb4811909c23fa1d7 upstream. Instead of individually passing cprm->siginfo and cprm->regs into fill_note_info pass all of struct coredump_params. This is preparation to allow fill_files_note to use the existing vma snapshot. Reviewed-by: Jann Horn Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- fs/binfmt_elf.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index bd6189f55ddb..1ae3fe3eddf1 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1799,7 +1799,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - const kernel_siginfo_t *siginfo, struct pt_regs *regs) + struct coredump_params *cprm) { struct task_struct *dump_task = current; const struct user_regset_view *view = task_user_regset_view(dump_task); @@ -1871,7 +1871,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, * Now fill in each thread's information. */ for (t = info->thread; t != NULL; t = t->next) - if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size)) + if (!fill_thread_core_info(t, view, cprm->siginfo->si_signo, &info->size)) return 0; /* @@ -1880,7 +1880,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm); info->size += notesize(&info->psinfo); - fill_siginfo_note(&info->signote, &info->csigdata, siginfo); + fill_siginfo_note(&info->signote, &info->csigdata, cprm->siginfo); info->size += notesize(&info->signote); fill_auxv_note(&info->auxv, current->mm); @@ -2028,7 +2028,7 @@ static int elf_note_info_init(struct elf_note_info *info) static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - const kernel_siginfo_t *siginfo, struct pt_regs *regs) + struct coredump_params *cprm) { struct core_thread *ct; struct elf_thread_status *ets; @@ -2049,13 +2049,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, list_for_each_entry(ets, &info->thread_list, list) { int sz; - sz = elf_dump_thread_status(siginfo->si_signo, ets); + sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets); info->thread_status_size += sz; } /* now collect the dump for the current */ memset(info->prstatus, 0, sizeof(*info->prstatus)); - fill_prstatus(&info->prstatus->common, current, siginfo->si_signo); - elf_core_copy_regs(&info->prstatus->pr_reg, regs); + fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo); + elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs); /* Set up header */ fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); @@ -2071,7 +2071,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_note(info->notes + 1, "CORE", NT_PRPSINFO, sizeof(*info->psinfo), info->psinfo); - fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo); + fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo); fill_auxv_note(info->notes + 3, current->mm); info->numnote = 4; @@ -2081,8 +2081,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, } /* Try to dump the FPU. */ - info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs, - info->fpu); + info->prstatus->pr_fpvalid = + elf_core_copy_task_fpregs(current, cprm->regs, info->fpu); if (info->prstatus->pr_fpvalid) fill_note(info->notes + info->numnote++, "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu); @@ -2195,7 +2195,7 @@ static int elf_core_dump(struct coredump_params *cprm) * Collect all the non-memory information about the process for the * notes. This also sets up the file header. */ - if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs)) + if (!fill_note_info(&elf, e_phnum, &info, cprm)) goto end_coredump; has_dumped = 1; -- cgit v1.2.3 From 39fd0cc079c98dafcf355997ada7b5e67f0bb10a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 8 Mar 2022 13:04:19 -0600 Subject: coredump: Use the vma snapshot in fill_files_note commit 390031c942116d4733310f0684beb8db19885fe6 upstream. Matthew Wilcox reported that there is a missing mmap_lock in file_files_note that could possibly lead to a user after free. Solve this by using the existing vma snapshot for consistency and to avoid the need to take the mmap_lock anywhere in the coredump code except for dump_vma_snapshot. Update the dump_vma_snapshot to capture vm_pgoff and vm_file that are neeeded by fill_files_note. Add free_vma_snapshot to free the captured values of vm_file. Reported-by: Matthew Wilcox Link: https://lkml.kernel.org/r/20220131153740.2396974-1-willy@infradead.org Cc: stable@vger.kernel.org Fixes: a07279c9a8cd ("binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot") Fixes: 2aa362c49c31 ("coredump: extend core dump note section to contain file names of mapped files") Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- fs/binfmt_elf.c | 24 ++++++++++++------------ fs/coredump.c | 22 +++++++++++++++++++++- include/linux/coredump.h | 2 ++ 3 files changed, 35 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 1ae3fe3eddf1..c93150f36a52 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1618,17 +1618,16 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, * long file_ofs * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL... */ -static int fill_files_note(struct memelfnote *note) +static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm) { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; unsigned count, size, names_ofs, remaining, n; user_long_t *data; user_long_t *start_end_ofs; char *name_base, *name_curpos; + int i; /* *Estimated* file count and total data size needed */ - count = mm->map_count; + count = cprm->vma_count; if (count > UINT_MAX / 64) return -EINVAL; size = count * 64; @@ -1650,11 +1649,12 @@ static int fill_files_note(struct memelfnote *note) name_base = name_curpos = ((char *)data) + names_ofs; remaining = size - names_ofs; count = 0; - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *m = &cprm->vma_meta[i]; struct file *file; const char *filename; - file = vma->vm_file; + file = m->file; if (!file) continue; filename = file_path(file, name_curpos, remaining); @@ -1674,9 +1674,9 @@ static int fill_files_note(struct memelfnote *note) memmove(name_curpos, filename, n); name_curpos += n; - *start_end_ofs++ = vma->vm_start; - *start_end_ofs++ = vma->vm_end; - *start_end_ofs++ = vma->vm_pgoff; + *start_end_ofs++ = m->start; + *start_end_ofs++ = m->end; + *start_end_ofs++ = m->pgoff; count++; } @@ -1687,7 +1687,7 @@ static int fill_files_note(struct memelfnote *note) * Count usually is less than mm->map_count, * we need to move filenames down. */ - n = mm->map_count - count; + n = cprm->vma_count - count; if (n != 0) { unsigned shift_bytes = n * 3 * sizeof(data[0]); memmove(name_base - shift_bytes, name_base, @@ -1886,7 +1886,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_auxv_note(&info->auxv, current->mm); info->size += notesize(&info->auxv); - if (fill_files_note(&info->files) == 0) + if (fill_files_note(&info->files, cprm) == 0) info->size += notesize(&info->files); return 1; @@ -2075,7 +2075,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_auxv_note(info->notes + 3, current->mm); info->numnote = 4; - if (fill_files_note(info->notes + info->numnote) == 0) { + if (fill_files_note(info->notes + info->numnote, cprm) == 0) { info->notes_files = info->notes + info->numnote; info->numnote++; } diff --git a/fs/coredump.c b/fs/coredump.c index e95fea3beaeb..26eb5a095832 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -54,6 +54,7 @@ #include static bool dump_vma_snapshot(struct coredump_params *cprm); +static void free_vma_snapshot(struct coredump_params *cprm); int core_uses_pid; unsigned int core_pipe_limit; @@ -834,7 +835,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) dump_emit(&cprm, "", 1); } file_end_write(cprm.file); - kvfree(cprm.vma_meta); + free_vma_snapshot(&cprm); } if (ispipe && core_pipe_limit) wait_for_dump_helpers(cprm.file); @@ -1111,6 +1112,20 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, return gate_vma; } +static void free_vma_snapshot(struct coredump_params *cprm) +{ + if (cprm->vma_meta) { + int i; + for (i = 0; i < cprm->vma_count; i++) { + struct file *file = cprm->vma_meta[i].file; + if (file) + fput(file); + } + kvfree(cprm->vma_meta); + cprm->vma_meta = NULL; + } +} + /* * Under the mmap_lock, take a snapshot of relevant information about the task's * VMAs. @@ -1147,6 +1162,11 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) m->end = vma->vm_end; m->flags = vma->vm_flags; m->dump_size = vma_dump_size(vma, cprm->mm_flags); + m->pgoff = vma->vm_pgoff; + + m->file = vma->vm_file; + if (m->file) + get_file(m->file); } mmap_write_unlock(mm); diff --git a/include/linux/coredump.h b/include/linux/coredump.h index cc1aee2cb46a..4b95e46d215f 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -12,6 +12,8 @@ struct core_vma_metadata { unsigned long start, end; unsigned long flags; unsigned long dump_size; + unsigned long pgoff; + struct file *file; }; extern int core_uses_pid; -- cgit v1.2.3 From 3ae7163598c611a8892698e0c056fe794e52b44b Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 2 Mar 2022 14:51:53 +0800 Subject: ceph: fix inode reference leakage in ceph_get_snapdir() [ Upstream commit 322794d3355c33adcc4feace0045d85a8e4ed813 ] The ceph_get_inode() will search for or insert a new inode into the hash for the given vino, and return a reference to it. If new is non-NULL, its reference is consumed. We should release the reference when in error handing cases. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov Signed-off-by: Sasha Levin --- fs/ceph/inode.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1c7574105478..42e449d3f18b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -87,13 +87,13 @@ struct inode *ceph_get_snapdir(struct inode *parent) if (!S_ISDIR(parent->i_mode)) { pr_warn_once("bad snapdir parent type (mode=0%o)\n", parent->i_mode); - return ERR_PTR(-ENOTDIR); + goto err; } if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) { pr_warn_once("bad snapdir inode type (mode=0%o)\n", inode->i_mode); - return ERR_PTR(-ENOTDIR); + goto err; } inode->i_mode = parent->i_mode; @@ -113,6 +113,12 @@ struct inode *ceph_get_snapdir(struct inode *parent) } return inode; +err: + if ((inode->i_state & I_NEW)) + discard_new_inode(inode); + else + iput(inode); + return ERR_PTR(-ENOTDIR); } const struct inode_operations ceph_file_iops = { -- cgit v1.2.3 From 2fe82d3254029ef9ec4e7be890125d5ef4f537de Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Sat, 5 Mar 2022 19:52:59 +0800 Subject: ceph: fix memory leak in ceph_readdir when note_last_dentry returns error [ Upstream commit f639d9867eea647005dc824e0e24f39ffc50d4e4 ] Reset the last_readdir at the same time, and add a comment explaining why we don't free last_readdir when dir_emit returns false. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov Signed-off-by: Sasha Levin --- fs/ceph/dir.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 133dbd9338e7..d91fa53e12b3 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -478,8 +478,11 @@ more: 2 : (fpos_off(rde->offset) + 1); err = note_last_dentry(dfi, rde->name, rde->name_len, next_offset); - if (err) + if (err) { + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; return err; + } } else if (req->r_reply_info.dir_end) { dfi->next_offset = 2; /* keep last name */ @@ -520,6 +523,12 @@ more: if (!dir_emit(ctx, rde->name, rde->name_len, ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)), le32_to_cpu(rde->inode.in->mode) >> 12)) { + /* + * NOTE: Here no need to put the 'dfi->last_readdir', + * because when dir_emit stops us it's most likely + * doesn't have enough memory, etc. So for next readdir + * it will continue. + */ dout("filldir stopping us...\n"); return 0; } -- cgit v1.2.3 From 31e027259ce48cfa06700ad5a1a377768b56bf8b Mon Sep 17 00:00:00 2001 From: Qinghua Jin Date: Wed, 23 Mar 2022 16:06:23 -0700 Subject: minix: fix bug when opening a file with O_DIRECT [ Upstream commit 9ce3c0d26c42d279b6c378a03cd6a61d828f19ca ] Testcase: 1. create a minix file system and mount it 2. open a file on the file system with O_RDWR|O_CREAT|O_TRUNC|O_DIRECT 3. open fails with -EINVAL but leaves an empty file behind. All other open() failures don't leave the failed open files behind. It is hard to check the direct_IO op before creating the inode. Just as ext4 and btrfs do, this patch will resolve the issue by allowing to create the file with O_DIRECT but returning error when writing the file. Link: https://lkml.kernel.org/r/20220107133626.413379-1-qhjin.dev@gmail.com Signed-off-by: Qinghua Jin Reported-by: Colin Ian King Reviewed-by: Jan Kara Acked-by: Christian Brauner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/minix/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/minix/inode.c b/fs/minix/inode.c index a71f1cf894b9..d4bd94234ef7 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -447,7 +447,8 @@ static const struct address_space_operations minix_aops = { .writepage = minix_writepage, .write_begin = minix_write_begin, .write_end = generic_write_end, - .bmap = minix_bmap + .bmap = minix_bmap, + .direct_IO = noop_direct_IO }; static const struct inode_operations minix_symlink_inode_operations = { -- cgit v1.2.3 From b37f482ba9f0e6382c188e3fccf6c4b2fdc938eb Mon Sep 17 00:00:00 2001 From: Xin Xiong Date: Tue, 25 Jan 2022 21:10:45 +0800 Subject: NFSv4.2: fix reference count leaks in _nfs42_proc_copy_notify() [ Upstream commit b7f114edd54326f730a754547e7cfb197b5bc132 ] [You don't often get email from xiongx18@fudan.edu.cn. Learn why this is important at http://aka.ms/LearnAboutSenderIdentification.] The reference counting issue happens in two error paths in the function _nfs42_proc_copy_notify(). In both error paths, the function simply returns the error code and forgets to balance the refcount of object `ctx`, bumped by get_nfs_open_context() earlier, which may cause refcount leaks. Fix it by balancing refcount of the `ctx` object before the function returns in both error paths. Signed-off-by: Xin Xiong Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/nfs42proc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 9865b5c37d88..93f4d8257525 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -586,8 +586,10 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst, ctx = get_nfs_open_context(nfs_file_open_context(src)); l_ctx = nfs_get_lock_context(ctx); - if (IS_ERR(l_ctx)) - return PTR_ERR(l_ctx); + if (IS_ERR(l_ctx)) { + status = PTR_ERR(l_ctx); + goto out; + } status = nfs4_set_rw_stateid(&args->cna_src_stateid, ctx, l_ctx, FMODE_READ); @@ -595,7 +597,7 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst, if (status) { if (status == -EAGAIN) status = -NFS4ERR_BAD_STATEID; - return status; + goto out; } status = nfs4_call_sync(src_server->client, src_server, &msg, @@ -603,6 +605,7 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst, if (status == -ENOTSUPP) src_server->caps &= ~NFS_CAP_COPY_NOTIFY; +out: put_nfs_open_context(nfs_file_open_context(src)); return status; } -- cgit v1.2.3 From a34752aa23975196fb38c5caf98485f258eef3eb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 29 Jan 2022 13:32:45 -0500 Subject: NFSv4: Protect the state recovery thread against direct reclaim [ Upstream commit 3e17898aca293a24dae757a440a50aa63ca29671 ] If memory allocation triggers a direct reclaim from the state recovery thread, then we can deadlock. Use memalloc_nofs_save/restore to ensure that doesn't happen. Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/nfs4state.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 51f5cb41e87a..57ea63e2cdb4 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -49,6 +49,7 @@ #include #include #include +#include #include @@ -2559,9 +2560,17 @@ static void nfs4_layoutreturn_any_run(struct nfs_client *clp) static void nfs4_state_manager(struct nfs_client *clp) { + unsigned int memflags; int status = 0; const char *section = "", *section_sep = ""; + /* + * State recovery can deadlock if the direct reclaim code tries + * start NFS writeback. So ensure memory allocations are all + * GFP_NOFS. + */ + memflags = memalloc_nofs_save(); + /* Ensure exclusive access to NFSv4 state */ do { trace_nfs4_state_mgr(clp); @@ -2656,6 +2665,7 @@ static void nfs4_state_manager(struct nfs_client *clp) clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); } + memalloc_nofs_restore(memflags); nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); @@ -2673,6 +2683,7 @@ static void nfs4_state_manager(struct nfs_client *clp) return; if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) return; + memflags = memalloc_nofs_save(); } while (refcount_read(&clp->cl_count) > 1 && !signalled()); goto out_drain; @@ -2685,6 +2696,7 @@ out_error: clp->cl_hostname, -status); ssleep(1); out_drain: + memalloc_nofs_restore(memflags); nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); } -- cgit v1.2.3 From 24d28d9b0fd5c544a56804cda4458cf6b53afb84 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 7 Mar 2022 10:41:44 +1100 Subject: NFS: swap IO handling is slightly different for O_DIRECT IO [ Upstream commit 64158668ac8b31626a8ce48db4cad08496eb8340 ] 1/ Taking the i_rwsem for swap IO triggers lockdep warnings regarding possible deadlocks with "fs_reclaim". These deadlocks could, I believe, eventuate if a buffered read on the swapfile was attempted. We don't need coherence with the page cache for a swap file, and buffered writes are forbidden anyway. There is no other need for i_rwsem during direct IO. So never take it for swap_rw() 2/ generic_write_checks() explicitly forbids writes to swap, and performs checks that are not needed for swap. So bypass it for swap_rw(). Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/direct.c | 42 ++++++++++++++++++++++++++++-------------- fs/nfs/file.c | 4 ++-- include/linux/nfs_fs.h | 8 ++++---- 3 files changed, 34 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 3c0335c15a73..28afc315ec0c 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -172,8 +172,8 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (iov_iter_rw(iter) == READ) - return nfs_file_direct_read(iocb, iter); - return nfs_file_direct_write(iocb, iter); + return nfs_file_direct_read(iocb, iter, true); + return nfs_file_direct_write(iocb, iter, true); } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) @@ -424,6 +424,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_read - file direct read operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers into which to read data + * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct reads instead of calling * generic_file_aio_read() in order to avoid gfar's check to see if @@ -439,7 +440,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * client must read the updated atime from the server back into its * cache. */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, + bool swap) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -481,12 +483,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (iter_is_iovec(iter)) dreq->flags = NFS_ODIRECT_SHOULD_DIRTY; - nfs_start_io_direct(inode); + if (!swap) + nfs_start_io_direct(inode); NFS_I(inode)->read_io += count; requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); - nfs_end_io_direct(inode); + if (!swap) + nfs_end_io_direct(inode); if (requested > 0) { result = nfs_direct_wait(dreq); @@ -875,6 +879,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers from which to write data + * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct writes instead of calling * generic_file_aio_write() in order to avoid taking the inode @@ -891,7 +896,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * Note that O_APPEND is not supported for NFS direct writes, as there * is no atomic O_APPEND write facility in the NFS protocol. */ -ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) +ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, + bool swap) { ssize_t result, requested; size_t count; @@ -905,7 +911,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, iov_iter_count(iter), (long long) iocb->ki_pos); - result = generic_write_checks(iocb, iter); + if (swap) + /* bypass generic checks */ + result = iov_iter_count(iter); + else + result = generic_write_checks(iocb, iter); if (result <= 0) return result; count = result; @@ -936,16 +946,20 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dreq->iocb = iocb; pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode); - nfs_start_io_direct(inode); + if (swap) { + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); + } else { + nfs_start_io_direct(inode); - requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); - if (mapping->nrpages) { - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - } + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, end); + } - nfs_end_io_direct(inode); + nfs_end_io_direct(inode); + } if (requested > 0) { result = nfs_direct_wait(dreq); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index aa353fd58240..42a16993913a 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -161,7 +161,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) ssize_t result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_read(iocb, to); + return nfs_file_direct_read(iocb, to, false); dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, @@ -616,7 +616,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) return result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_write(iocb, from); + return nfs_file_direct_write(iocb, from, false); dprintk("NFS: write(%pD2, %zu@%Ld)\n", file, iov_iter_count(from), (long long) iocb->ki_pos); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 4a733f140939..41102e03512f 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -494,10 +494,10 @@ static inline const struct cred *nfs_file_cred(struct file *file) * linux/fs/nfs/direct.c */ extern ssize_t nfs_direct_IO(struct kiocb *, struct iov_iter *); -extern ssize_t nfs_file_direct_read(struct kiocb *iocb, - struct iov_iter *iter); -extern ssize_t nfs_file_direct_write(struct kiocb *iocb, - struct iov_iter *iter); +ssize_t nfs_file_direct_read(struct kiocb *iocb, + struct iov_iter *iter, bool swap); +ssize_t nfs_file_direct_write(struct kiocb *iocb, + struct iov_iter *iter, bool swap); /* * linux/fs/nfs/dir.c -- cgit v1.2.3 From 6bb2270223a8128efdcaf1654ba416f693eb4623 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 7 Mar 2022 10:41:44 +1100 Subject: NFS: swap-out must always use STABLE writes. [ Upstream commit c265de257f558a05c1859ee9e3fed04883b9ec0e ] The commit handling code is not safe against memory-pressure deadlocks when writing to swap. In particular, nfs_commitdata_alloc() blocks indefinitely waiting for memory, and this can consume all available workqueue threads. swap-out most likely uses STABLE writes anyway as COND_STABLE indicates that a stable write should be used if the write fits in a single request, and it normally does. However if we ever swap with a small wsize, or gather unusually large numbers of pages for a single write, this might change. For safety, make it explicit in the code that direct writes used for swap must always use FLUSH_STABLE. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/direct.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 28afc315ec0c..c220810c61d1 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -793,7 +793,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { */ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, struct iov_iter *iter, - loff_t pos) + loff_t pos, int ioflags) { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; @@ -801,7 +801,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, size_t requested_bytes = 0; size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); - nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false, + nfs_pageio_init_write(&desc, inode, ioflags, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; get_dreq(dreq); @@ -947,11 +947,13 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode); if (swap) { - requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, + FLUSH_STABLE); } else { nfs_start_io_direct(inode); - requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, + FLUSH_COND_STABLE); if (mapping->nrpages) { invalidate_inode_pages2_range(mapping, -- cgit v1.2.3 From d925b7e78b62805fcc5440d1521181c82b6f03cb Mon Sep 17 00:00:00 2001 From: Haimin Zhang Date: Tue, 22 Mar 2022 21:59:17 +0800 Subject: jfs: prevent NULL deref in diFree [ Upstream commit a53046291020ec41e09181396c1e829287b48d47 ] Add validation check for JFS_IP(ipimap)->i_imap to prevent a NULL deref in diFree since diFree uses it without do any validations. When function jfs_mount calls diMount to initialize fileset inode allocation map, it can fail and JFS_IP(ipimap)->i_imap won't be initialized. Then it calls diFreeSpecial to close fileset inode allocation map inode and it will flow into jfs_evict_inode. Function jfs_evict_inode just validates JFS_SBI(inode->i_sb)->ipimap, then calls diFree. diFree use JFS_IP(ipimap)->i_imap directly, then it will cause a NULL deref. Reported-by: TCS Robot Signed-off-by: Haimin Zhang Signed-off-by: Dave Kleikamp Signed-off-by: Sasha Levin --- fs/jfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 57ab424c05ff..072821b50ab9 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -146,12 +146,13 @@ void jfs_evict_inode(struct inode *inode) dquot_initialize(inode); if (JFS_IP(inode)->fileset == FILESYSTEM_I) { + struct inode *ipimap = JFS_SBI(inode->i_sb)->ipimap; truncate_inode_pages_final(&inode->i_data); if (test_cflag(COMMIT_Freewmap, inode)) jfs_free_zero_link(inode); - if (JFS_SBI(inode->i_sb)->ipimap) + if (ipimap && JFS_IP(ipimap)->i_imap) diFree(inode); /* -- cgit v1.2.3 From da747de6859948e75bcaaa1019ef0f7fdaf0b867 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 21 Mar 2022 12:34:19 -0400 Subject: NFS: nfsiod should not block forever in mempool_alloc() [ Upstream commit 515dcdcd48736576c6f5c197814da6f81c60a21e ] The concern is that since nfsiod is sometimes required to kick off a commit, it can get locked up waiting forever in mempool_alloc() instead of failing gracefully and leaving the commit until later. Try to allocate from the slab first, with GFP_KERNEL | __GFP_NORETRY, then fall back to a non-blocking attempt to allocate from the memory pool. Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/internal.h | 7 +++++++ fs/nfs/pnfs_nfs.c | 8 ++++++-- fs/nfs/write.c | 24 +++++++++--------------- include/linux/nfs_fs.h | 2 +- 4 files changed, 23 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 66fc936834f2..7239118d98a3 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -580,6 +580,13 @@ nfs_write_match_verf(const struct nfs_writeverf *verf, !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier); } +static inline gfp_t nfs_io_gfp_mask(void) +{ + if (current->flags & PF_WQ_WORKER) + return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; + return GFP_KERNEL; +} + /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 316f68f96e57..657c242a18ff 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -419,7 +419,7 @@ static struct nfs_commit_data * pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket, struct nfs_commit_info *cinfo) { - struct nfs_commit_data *data = nfs_commitdata_alloc(false); + struct nfs_commit_data *data = nfs_commitdata_alloc(); if (!data) return NULL; @@ -515,7 +515,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, unsigned int nreq = 0; if (!list_empty(mds_pages)) { - data = nfs_commitdata_alloc(true); + data = nfs_commitdata_alloc(); + if (!data) { + nfs_retry_commit(mds_pages, NULL, cinfo, -1); + return -ENOMEM; + } data->ds_commit_index = -1; list_splice_init(mds_pages, &data->pages); list_add_tail(&data->list, &list); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0691b0b02147..a296e504e4aa 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -70,27 +70,17 @@ static mempool_t *nfs_wdata_mempool; static struct kmem_cache *nfs_cdata_cachep; static mempool_t *nfs_commit_mempool; -struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail) +struct nfs_commit_data *nfs_commitdata_alloc(void) { struct nfs_commit_data *p; - if (never_fail) - p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); - else { - /* It is OK to do some reclaim, not no safe to wait - * for anything to be returned to the pool. - * mempool_alloc() cannot handle that particular combination, - * so we need two separate attempts. - */ + p = kmem_cache_zalloc(nfs_cdata_cachep, nfs_io_gfp_mask()); + if (!p) { p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT); - if (!p) - p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO | - __GFP_NOWARN | __GFP_NORETRY); if (!p) return NULL; + memset(p, 0, sizeof(*p)); } - - memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); return p; } @@ -1809,7 +1799,11 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, if (list_empty(head)) return 0; - data = nfs_commitdata_alloc(true); + data = nfs_commitdata_alloc(); + if (!data) { + nfs_retry_commit(head, NULL, cinfo, -1); + return -ENOMEM; + } /* Set up the argument struct */ nfs_init_commit(data, head, NULL, cinfo); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 41102e03512f..5ffcde9ac413 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -567,7 +567,7 @@ extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_page(struct inode *inode, struct page *page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); -extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail); +extern struct nfs_commit_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_commit_data *data); bool nfs_commit_end(struct nfs_mds_commit_info *cinfo); -- cgit v1.2.3 From ea029e4ce760f786919d06ef52efa2e50ea92a5f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 21 Mar 2022 13:48:36 -0400 Subject: NFS: Avoid writeback threads getting stuck in mempool_alloc() [ Upstream commit 0bae835b63c53f86cdc524f5962e39409585b22c ] In a low memory situation, allow the NFS writeback code to fail without getting stuck in infinite loops in mempool_alloc(). Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/pagelist.c | 10 +++++----- fs/nfs/write.c | 10 ++++++++-- 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b1130dc200d2..f2fe23e6c51f 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -90,10 +90,10 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) } } -static inline struct nfs_page * -nfs_page_alloc(void) +static inline struct nfs_page *nfs_page_alloc(void) { - struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL); + struct nfs_page *p = + kmem_cache_zalloc(nfs_page_cachep, nfs_io_gfp_mask()); if (p) INIT_LIST_HEAD(&p->wb_list); return p; @@ -901,7 +901,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, struct nfs_commit_info cinfo; struct nfs_page_array *pg_array = &hdr->page_array; unsigned int pagecount, pageused; - gfp_t gfp_flags = GFP_KERNEL; + gfp_t gfp_flags = nfs_io_gfp_mask(); pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); pg_array->npages = pagecount; @@ -988,7 +988,7 @@ nfs_pageio_alloc_mirrors(struct nfs_pageio_descriptor *desc, desc->pg_mirrors_dynamic = NULL; if (mirror_count == 1) return desc->pg_mirrors_static; - ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_KERNEL); + ret = kmalloc_array(mirror_count, sizeof(*ret), nfs_io_gfp_mask()); if (ret != NULL) { for (i = 0; i < mirror_count; i++) nfs_pageio_mirror_init(&ret[i], desc->pg_bsize); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index a296e504e4aa..d21b25511499 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -94,9 +94,15 @@ EXPORT_SYMBOL_GPL(nfs_commit_free); static struct nfs_pgio_header *nfs_writehdr_alloc(void) { - struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_KERNEL); + struct nfs_pgio_header *p; - memset(p, 0, sizeof(*p)); + p = kmem_cache_zalloc(nfs_wdata_cachep, nfs_io_gfp_mask()); + if (!p) { + p = mempool_alloc(nfs_wdata_mempool, GFP_NOWAIT); + if (!p) + return NULL; + memset(p, 0, sizeof(*p)); + } p->rw_mode = FMODE_WRITE; return p; } -- cgit v1.2.3 From 9f0c217469e84e71f7423bce64527c0b56bf9228 Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Tue, 29 Mar 2022 19:32:07 +0800 Subject: Revert "NFSv4: Handle the special Linux file open access mode" [ Upstream commit ab0fc21bc7105b54bafd85bd8b82742f9e68898a ] This reverts commit 44942b4e457beda00981f616402a1a791e8c616e. After secondly opening a file with O_ACCMODE|O_DIRECT flags, nfs4_valid_open_stateid() will dereference NULL nfs4_state when lseek(). Reproducer: 1. mount -t nfs -o vers=4.2 $server_ip:/ /mnt/ 2. fd = open("/mnt/file", O_ACCMODE|O_DIRECT|O_CREAT) 3. close(fd) 4. fd = open("/mnt/file", O_ACCMODE|O_DIRECT) 5. lseek(fd) Reported-by: Lyu Tao Signed-off-by: ChenXiaoSong Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/inode.c | 1 - fs/nfs/nfs4file.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 410f87bc48cc..f4f75db7a825 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1167,7 +1167,6 @@ int nfs_open(struct inode *inode, struct file *filp) nfs_fscache_open_file(inode, filp); return 0; } -EXPORT_SYMBOL_GPL(nfs_open); /* * This function is called whenever some part of NFS notices that diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index c91565227ea2..8f35b5e13e93 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -51,7 +51,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) return err; if ((openflags & O_ACCMODE) == 3) - return nfs_open(inode, filp); + openflags--; /* We can't create new files here */ openflags &= ~(O_CREAT|O_EXCL); -- cgit v1.2.3 From 6f52d4cda0bf77ed53b6d0cd1339ddba46551b98 Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Tue, 29 Mar 2022 19:32:08 +0800 Subject: NFSv4: fix open failure with O_ACCMODE flag [ Upstream commit b243874f6f9568b2daf1a00e9222cacdc15e159c ] open() with O_ACCMODE|O_DIRECT flags secondly will fail. Reproducer: 1. mount -t nfs -o vers=4.2 $server_ip:/ /mnt/ 2. fd = open("/mnt/file", O_ACCMODE|O_DIRECT|O_CREAT) 3. close(fd) 4. fd = open("/mnt/file", O_ACCMODE|O_DIRECT) Server nfsd4_decode_share_access() will fail with error nfserr_bad_xdr when client use incorrect share access mode of 0. Fix this by using NFS4_SHARE_ACCESS_BOTH share access mode in client, just like firstly opening. Fixes: ce4ef7c0a8a05 ("NFS: Split out NFS v4 file operations") Signed-off-by: ChenXiaoSong Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/dir.c | 10 ---------- fs/nfs/internal.h | 10 ++++++++++ fs/nfs/nfs4file.c | 6 ++++-- 3 files changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 9adc6f57a008..78219396788b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1835,16 +1835,6 @@ const struct dentry_operations nfs4_dentry_operations = { }; EXPORT_SYMBOL_GPL(nfs4_dentry_operations); -static fmode_t flags_to_mode(int flags) -{ - fmode_t res = (__force fmode_t)flags & FMODE_EXEC; - if ((flags & O_ACCMODE) != O_WRONLY) - res |= FMODE_READ; - if ((flags & O_ACCMODE) != O_RDONLY) - res |= FMODE_WRITE; - return res; -} - static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags, struct file *filp) { return alloc_nfs_open_context(dentry, flags_to_mode(open_flags), filp); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7239118d98a3..c8845242d422 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -42,6 +42,16 @@ static inline bool nfs_lookup_is_soft_revalidate(const struct dentry *dentry) return true; } +static inline fmode_t flags_to_mode(int flags) +{ + fmode_t res = (__force fmode_t)flags & FMODE_EXEC; + if ((flags & O_ACCMODE) != O_WRONLY) + res |= FMODE_READ; + if ((flags & O_ACCMODE) != O_RDONLY) + res |= FMODE_WRITE; + return res; +} + /* * Note: RFC 1813 doesn't limit the number of auth flavors that * a server can return, so make something up. diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 8f35b5e13e93..4120e1cb3fee 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -32,6 +32,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) struct dentry *parent = NULL; struct inode *dir; unsigned openflags = filp->f_flags; + fmode_t f_mode; struct iattr attr; int err; @@ -50,8 +51,9 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (err) return err; + f_mode = filp->f_mode; if ((openflags & O_ACCMODE) == 3) - openflags--; + f_mode |= flags_to_mode(openflags); /* We can't create new files here */ openflags &= ~(O_CREAT|O_EXCL); @@ -59,7 +61,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) parent = dget_parent(dentry); dir = d_inode(parent); - ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp); + ctx = alloc_nfs_open_context(file_dentry(filp), f_mode, filp); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; -- cgit v1.2.3 From 50c981bd77900293c32f91b3d28344554a5f3c0b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 6 Apr 2022 12:43:57 +0100 Subject: io_uring: nospec index for tags on files update [ Upstream commit 34bb77184123ae401100a4d156584f12fa630e5c ] Don't forget to array_index_nospec() for indexes before updating rsrc tags in __io_sqe_files_update(), just use already safe and precalculated index @i. Fixes: c3bdad0271834 ("io_uring: add generic rsrc update with tags") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5fc3a62eae72..21823d1e91de 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8589,7 +8589,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - *io_get_tag_slot(data, up->offset + done) = tag; + *io_get_tag_slot(data, i) = tag; io_fixed_file_set(file_slot, file); err = io_sqe_file_register(ctx, file, i); if (err) { -- cgit v1.2.3 From 285f5d724005758a8b215fa0a9ef3e282d646edb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 6 Apr 2022 12:43:58 +0100 Subject: io_uring: don't touch scm_fp_list after queueing skb [ Upstream commit a07211e3001435fe8591b992464cd8d5e3c98c5a ] It's safer to not touch scm_fp_list after we queued an skb to which it was assigned, there might be races lurking if we screw subtle sync guarantees on the io_uring side. Fixes: 6b06314c47e14 ("io_uring: add file set registration") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/io_uring.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 21823d1e91de..2faa558778ba 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8126,8 +8126,12 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_queue_head(&sk->sk_receive_queue, skb); - for (i = 0; i < nr_files; i++) - fput(fpl->fp[i]); + for (i = 0; i < nr; i++) { + struct file *file = io_file_from_index(ctx, i + offset); + + if (file) + fput(file); + } } else { kfree_skb(skb); free_uid(fpl->user); -- cgit v1.2.3 From 056e1e7c17136cea36267383a43d586509566931 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2022 11:06:02 -0600 Subject: io_uring: don't check req->file in io_fsync_prep() commit ec858afda857e361182ceafc3d2ba2b164b8e889 upstream. This is a leftover from the really old days where we weren't able to track and error early if we need a file and it wasn't assigned. Kill the check. Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2faa558778ba..e291d5e1e749 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4128,9 +4128,6 @@ static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; - if (!req->file) - return -EBADF; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || -- cgit v1.2.3 From ae6cba337cbfda855b028f1e6ae83c22de4f5212 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 29 Mar 2022 10:59:20 -0600 Subject: io_uring: defer splice/tee file validity check until command issue commit a3e4bc23d5470b2beb7cc42a86b6a3e75b704c15 upstream. In preparation for not using the file at prep time, defer checking if this file refers to a valid io_uring instance until issue time. This also means we can get rid of the cleanup flag for splice and tee. Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 49 +++++++++++++++++++++---------------------------- 1 file changed, 21 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index e291d5e1e749..6b71072f1408 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -623,10 +623,10 @@ struct io_epoll { struct io_splice { struct file *file_out; - struct file *file_in; loff_t off_out; loff_t off_in; u64 len; + int splice_fd_in; unsigned int flags; }; @@ -1452,14 +1452,6 @@ static void io_prep_async_work(struct io_kiocb *req) if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } - - switch (req->opcode) { - case IORING_OP_SPLICE: - case IORING_OP_TEE: - if (!S_ISREG(file_inode(req->splice.file_in)->i_mode)) - req->work.flags |= IO_WQ_WORK_UNBOUND; - break; - } } static void io_prep_async_link(struct io_kiocb *req) @@ -4027,18 +4019,11 @@ static int __io_splice_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - sp->file_in = NULL; sp->len = READ_ONCE(sqe->len); sp->flags = READ_ONCE(sqe->splice_flags); - if (unlikely(sp->flags & ~valid_flags)) return -EINVAL; - - sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in), - (sp->flags & SPLICE_F_FD_IN_FIXED)); - if (!sp->file_in) - return -EBADF; - req->flags |= REQ_F_NEED_CLEANUP; + sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in); return 0; } @@ -4053,20 +4038,27 @@ static int io_tee_prep(struct io_kiocb *req, static int io_tee(struct io_kiocb *req, unsigned int issue_flags) { struct io_splice *sp = &req->splice; - struct file *in = sp->file_in; struct file *out = sp->file_out; unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + struct file *in; long ret = 0; if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; + + in = io_file_get(req->ctx, req, sp->splice_fd_in, + (sp->flags & SPLICE_F_FD_IN_FIXED)); + if (!in) { + ret = -EBADF; + goto done; + } + if (sp->len) ret = do_tee(in, out, sp->len, flags); if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) io_put_file(in); - req->flags &= ~REQ_F_NEED_CLEANUP; - +done: if (ret != sp->len) req_set_fail(req); io_req_complete(req, ret); @@ -4085,15 +4077,22 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_splice(struct io_kiocb *req, unsigned int issue_flags) { struct io_splice *sp = &req->splice; - struct file *in = sp->file_in; struct file *out = sp->file_out; unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; loff_t *poff_in, *poff_out; + struct file *in; long ret = 0; if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; + in = io_file_get(req->ctx, req, sp->splice_fd_in, + (sp->flags & SPLICE_F_FD_IN_FIXED)); + if (!in) { + ret = -EBADF; + goto done; + } + poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; @@ -4102,8 +4101,7 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) io_put_file(in); - req->flags &= ~REQ_F_NEED_CLEANUP; - +done: if (ret != sp->len) req_set_fail(req); io_req_complete(req, ret); @@ -6649,11 +6647,6 @@ static void io_clean_op(struct io_kiocb *req) kfree(io->free_iov); break; } - case IORING_OP_SPLICE: - case IORING_OP_TEE: - if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED)) - io_put_file(req->splice.file_in); - break; case IORING_OP_OPENAT: case IORING_OP_OPENAT2: if (req->open.filename) -- cgit v1.2.3 From fe223dd2f18b30dea66cee1174908a934e942e33 Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Wed, 6 Apr 2022 13:55:33 +0200 Subject: io_uring: implement compat handling for IORING_REGISTER_IOWQ_AFF commit 0f5e4b83b37a96e3643951588ed7176b9b187c0a upstream. Similarly to the way it is done im mbind syscall. Cc: stable@vger.kernel.org # 5.14 Fixes: fe76421d1da1dcdb ("io_uring: allow user configurable IO thread CPU affinity") Signed-off-by: Eugene Syromiatnikov Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6b71072f1408..a3a175965929 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -10683,7 +10683,15 @@ static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, if (len > cpumask_size()) len = cpumask_size(); - if (copy_from_user(new_mask, arg, len)) { + if (in_compat_syscall()) { + ret = compat_get_bitmap(cpumask_bits(new_mask), + (const compat_ulong_t __user *)arg, + len * 8 /* CHAR_BIT */); + } else { + ret = copy_from_user(new_mask, arg, len); + } + + if (ret) { free_cpumask_var(new_mask); return -EFAULT; } -- cgit v1.2.3 From ba7261af2b030ab2c06189be1fc77b273716839f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 8 Apr 2022 11:08:58 -0600 Subject: io_uring: fix race between timeout flush and removal commit e677edbcabee849bfdd43f1602bccbecf736a646 upstream. io_flush_timeouts() assumes the timeout isn't in progress of triggering or being removed/canceled, so it unconditionally removes it from the timeout list and attempts to cancel it. Leave it on the list and let the normal timeout cancelation take care of it. Cc: stable@vger.kernel.org # 5.5+ Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index a3a175965929..3d44d48b35ea 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1538,12 +1538,11 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) __must_hold(&ctx->completion_lock) { u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + struct io_kiocb *req, *tmp; spin_lock_irq(&ctx->timeout_lock); - while (!list_empty(&ctx->timeout_list)) { + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { u32 events_needed, events_got; - struct io_kiocb *req = list_first_entry(&ctx->timeout_list, - struct io_kiocb, timeout.list); if (io_is_timeout_noseq(req)) break; @@ -1560,7 +1559,6 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) if (events_got < events_needed) break; - list_del_init(&req->timeout.list); io_kill_timeout(req, 0); } ctx->cq_last_tm_flush = seq; @@ -6205,6 +6203,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; + INIT_LIST_HEAD(&req->timeout.list); data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); -- cgit v1.2.3 From 4b98799e181b4326a613108cf37acc1f55d21b45 Mon Sep 17 00:00:00 2001 From: Ethan Lien Date: Mon, 7 Mar 2022 18:00:04 +0800 Subject: btrfs: fix qgroup reserve overflow the qgroup limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit b642b52d0b50f4d398cb4293f64992d0eed2e2ce upstream. We use extent_changeset->bytes_changed in qgroup_reserve_data() to record how many bytes we set for EXTENT_QGROUP_RESERVED state. Currently the bytes_changed is set as "unsigned int", and it will overflow if we try to fallocate a range larger than 4GiB. The result is we reserve less bytes and eventually break the qgroup limit. Unlike regular buffered/direct write, which we use one changeset for each ordered extent, which can never be larger than 256M. For fallocate, we use one changeset for the whole range, thus it no longer respects the 256M per extent limit, and caused the problem. The following example test script reproduces the problem: $ cat qgroup-overflow.sh #!/bin/bash DEV=/dev/sdj MNT=/mnt/sdj mkfs.btrfs -f $DEV mount $DEV $MNT # Set qgroup limit to 2GiB. btrfs quota enable $MNT btrfs qgroup limit 2G $MNT # Try to fallocate a 3GiB file. This should fail. echo echo "Try to fallocate a 3GiB file..." fallocate -l 3G $MNT/3G.file # Try to fallocate a 5GiB file. echo echo "Try to fallocate a 5GiB file..." fallocate -l 5G $MNT/5G.file # See we break the qgroup limit. echo sync btrfs qgroup show -r $MNT umount $MNT When running the test: $ ./qgroup-overflow.sh (...) Try to fallocate a 3GiB file... fallocate: fallocate failed: Disk quota exceeded Try to fallocate a 5GiB file... qgroupid         rfer         excl     max_rfer --------         ----         ----     -------- 0/5           5.00GiB      5.00GiB      2.00GiB Since we have no control of how bytes_changed is used, it's better to set it to u64. CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Qu Wenruo Signed-off-by: Ethan Lien Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent_io.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 53abdc280451..f7ab6ba8238e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -117,7 +117,7 @@ struct btrfs_bio_ctrl { */ struct extent_changeset { /* How many bytes are set/cleared in this operation */ - unsigned int bytes_changed; + u64 bytes_changed; /* Changed ranges */ struct ulist range_changed; -- cgit v1.2.3 From 887366faf0c9046611795086759afcfb1de2d7c7 Mon Sep 17 00:00:00 2001 From: Kaiwen Hu Date: Wed, 23 Mar 2022 15:10:32 +0800 Subject: btrfs: prevent subvol with swapfile from being deleted commit 60021bd754c6ca0addc6817994f20290a321d8d6 upstream. A subvolume with an active swapfile must not be deleted otherwise it would not be possible to deactivate it. After the subvolume is deleted, we cannot swapoff the swapfile in this deleted subvolume because the path is unreachable. The swapfile is still active and holding references, the filesystem cannot be unmounted. The test looks like this: mkfs.btrfs -f $dev > /dev/null mount $dev $mnt btrfs sub create $mnt/subvol touch $mnt/subvol/swapfile chmod 600 $mnt/subvol/swapfile chattr +C $mnt/subvol/swapfile dd if=/dev/zero of=$mnt/subvol/swapfile bs=1K count=4096 mkswap $mnt/subvol/swapfile swapon $mnt/subvol/swapfile btrfs sub delete $mnt/subvol swapoff $mnt/subvol/swapfile # failed: No such file or directory swapoff --all unmount $mnt # target is busy. To prevent above issue, we simply check that whether the subvolume contains any active swapfile, and stop the deleting process. This behavior is like snapshot ioctl dealing with a swapfile. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Robbie Ko Reviewed-by: Qu Wenruo Reviewed-by: Filipe Manana Signed-off-by: Kaiwen Hu Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/inode.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 58053b5f0ce1..e478fd916216 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4450,6 +4450,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) dest->root_key.objectid); return -EPERM; } + if (atomic_read(&dest->nr_swapfiles)) { + spin_unlock(&dest->root_item_lock); + btrfs_warn(fs_info, + "attempt to delete subvolume %llu with active swapfile", + root->root_key.objectid); + return -EPERM; + } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); @@ -10721,8 +10728,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, * set. We use this counter to prevent snapshots. We must increment it * before walking the extents because we don't want a concurrent * snapshot to run after we've already checked the extents. + * + * It is possible that subvolume is marked for deletion but still not + * removed yet. To prevent this race, we check the root status before + * activating the swapfile. */ + spin_lock(&root->root_item_lock); + if (btrfs_root_dead(root)) { + spin_unlock(&root->root_item_lock); + + btrfs_exclop_finish(fs_info); + btrfs_warn(fs_info, + "cannot activate swapfile because subvolume %llu is being deleted", + root->root_key.objectid); + return -EPERM; + } atomic_inc(&root->nr_swapfiles); + spin_unlock(&root->root_item_lock); isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); -- cgit v1.2.3 From 6308ab54c8ecf29c7f6bd5b262ef1511f52bee84 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 27 Sep 2021 15:21:43 +0800 Subject: btrfs: remove unused parameter nr_pages in add_ra_bio_pages() commit cd9255be6980012ad54f2d4fd3941bc2586e43e5 upstream. Variable @nr_pages only gets increased but never used. Remove it. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba Cc: Nathan Chancellor Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/compression.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 0913ee50e6c3..701fbd1b5676 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -550,7 +550,6 @@ static noinline int add_ra_bio_pages(struct inode *inode, u64 isize = i_size_read(inode); int ret; struct page *page; - unsigned long nr_pages = 0; struct extent_map *em; struct address_space *mapping = inode->i_mapping; struct extent_map_tree *em_tree; @@ -646,7 +645,6 @@ static noinline int add_ra_bio_pages(struct inode *inode, PAGE_SIZE, 0); if (ret == PAGE_SIZE) { - nr_pages++; put_page(page); } else { unlock_extent(tree, last_offset, end); -- cgit v1.2.3 From ed0e951463eaffe9f90bff6c2c69f080e9e0ad0a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 3 Feb 2022 15:36:44 +0000 Subject: btrfs: remove no longer used counter when reading data page commit ad3fc7946b1829213bbdbb2b9ad0d124b31ae4a7 upstream. After commit 92082d40976ed0 ("btrfs: integrate page status update for data read path into begin/end_page_read"), the 'nr' counter at btrfs_do_readpage() is no longer used, we increment it but we never read from it. So just remove it. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba Cc: Nathan Chancellor Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent_io.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 50a5cc4fd25b..96aeb16bd65c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3561,7 +3561,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, u64 cur_end; struct extent_map *em; int ret = 0; - int nr = 0; size_t pg_offset = 0; size_t iosize; size_t blocksize = inode->i_sb->s_blocksize; @@ -3727,9 +3726,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, end_bio_extent_readpage, 0, this_bio_flag, force_bio_submit); - if (!ret) { - nr++; - } else { + if (ret) { unlock_extent(tree, cur, cur + iosize - 1); end_page_read(page, false, cur, iosize); goto out; -- cgit v1.2.3 From ec13aa4e0085eba99fab07f9842a27b727cf50cd Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 24 Mar 2022 08:36:45 -0700 Subject: btrfs: remove unused variable in btrfs_{start,write}_dirty_block_groups() commit 6d4a6b515c39f1f8763093e0f828959b2fbc2f45 upstream. Clang's version of -Wunused-but-set-variable recently gained support for unary operations, which reveals two unused variables: fs/btrfs/block-group.c:2949:6: error: variable 'num_started' set but not used [-Werror,-Wunused-but-set-variable] int num_started = 0; ^ fs/btrfs/block-group.c:3116:6: error: variable 'num_started' set but not used [-Werror,-Wunused-but-set-variable] int num_started = 0; ^ 2 errors generated. These variables appear to be unused from their introduction, so just remove them to silence the warnings. Fixes: c9dc4c657850 ("Btrfs: two stage dirty block group writeout") Fixes: 1bbc621ef284 ("Btrfs: allow block group cache writeout outside critical section in commit") CC: stable@vger.kernel.org # 5.4+ Link: https://github.com/ClangBuiltLinux/linux/issues/1614 Signed-off-by: Nathan Chancellor Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/block-group.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 53ff3db9640e..37418b183b07 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2882,7 +2882,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_path *path = NULL; LIST_HEAD(dirty); struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; int loops = 0; spin_lock(&cur_trans->dirty_bgs_lock); @@ -2948,7 +2947,6 @@ again: cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; /* @@ -3049,7 +3047,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) int should_put; struct btrfs_path *path; struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; path = btrfs_alloc_path(); if (!path) @@ -3107,7 +3104,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; list_add_tail(&cache->io_list, io); } else { -- cgit v1.2.3 From 07cacfd9d9dc134557ac8866c73d570a59b3d1f3 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Mon, 28 Mar 2022 21:32:05 +0900 Subject: btrfs: release correct delalloc amount in direct IO write path commit 6d82ad13c4110e73c7b0392f00534a1502a1b520 upstream. Running generic/406 causes the following WARNING in btrfs_destroy_inode() which tells there are outstanding extents left. In btrfs_get_blocks_direct_write(), we reserve a temporary outstanding extents with btrfs_delalloc_reserve_metadata() (or indirectly from btrfs_delalloc_reserve_space(()). We then release the outstanding extents with btrfs_delalloc_release_extents(). However, the "len" can be modified in the COW case, which releases fewer outstanding extents than expected. Fix it by calling btrfs_delalloc_release_extents() for the original length. To reproduce the warning, the filesystem should be 1 GiB. It's triggering a short-write, due to not being able to allocate a large extent and instead allocating a smaller one. WARNING: CPU: 0 PID: 757 at fs/btrfs/inode.c:8848 btrfs_destroy_inode+0x1e6/0x210 [btrfs] Modules linked in: btrfs blake2b_generic xor lzo_compress lzo_decompress raid6_pq zstd zstd_decompress zstd_compress xxhash zram zsmalloc CPU: 0 PID: 757 Comm: umount Not tainted 5.17.0-rc8+ #101 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS d55cb5a 04/01/2014 RIP: 0010:btrfs_destroy_inode+0x1e6/0x210 [btrfs] RSP: 0018:ffffc9000327bda8 EFLAGS: 00010206 RAX: 0000000000000000 RBX: ffff888100548b78 RCX: 0000000000000000 RDX: 0000000000026900 RSI: 0000000000000000 RDI: ffff888100548b78 RBP: ffff888100548940 R08: 0000000000000000 R09: ffff88810b48aba8 R10: 0000000000000001 R11: ffff8881004eb240 R12: ffff88810b48a800 R13: ffff88810b48ec08 R14: ffff88810b48ed00 R15: ffff888100490c68 FS: 00007f8549ea0b80(0000) GS:ffff888237c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f854a09e733 CR3: 000000010a2e9003 CR4: 0000000000370eb0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: destroy_inode+0x33/0x70 dispose_list+0x43/0x60 evict_inodes+0x161/0x1b0 generic_shutdown_super+0x2d/0x110 kill_anon_super+0xf/0x20 btrfs_kill_super+0xd/0x20 [btrfs] deactivate_locked_super+0x27/0x90 cleanup_mnt+0x12c/0x180 task_work_run+0x54/0x80 exit_to_user_mode_prepare+0x152/0x160 syscall_exit_to_user_mode+0x12/0x30 do_syscall_64+0x42/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f854a000fb7 Fixes: f0bfa76a11e9 ("btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range") CC: stable@vger.kernel.org # 5.17 Reviewed-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Reviewed-by: Filipe Manana Signed-off-by: Naohiro Aota Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e478fd916216..7465ef012e22 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7772,6 +7772,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, u64 block_start, orig_start, orig_block_len, ram_bytes; bool can_nocow = false; bool space_reserved = false; + u64 prev_len; int ret = 0; /* @@ -7799,6 +7800,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, can_nocow = true; } + prev_len = len; if (can_nocow) { struct extent_map *em2; @@ -7828,8 +7830,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, goto out; } } else { - const u64 prev_len = len; - /* Our caller expects us to free the input extent map. */ free_extent_map(em); *map = NULL; @@ -7860,7 +7860,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, * We have created our ordered extent, so we can now release our reservation * for an outstanding extent. */ - btrfs_delalloc_release_extents(BTRFS_I(inode), len); + btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); /* * Need to update the i_size under the extent lock so buffered -- cgit v1.2.3 From 675e7d3086d0aa360b6ce856ce55cd003589f9ad Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Fri, 1 Apr 2022 06:25:17 +0000 Subject: cifs: release cached dentries only if mount is complete [ Upstream commit d788e51636462e61c6883f7d96b07b06bc291650 ] During cifs_kill_sb, we first dput all the dentries that we have cached. However this function can also get called for mount failures. So dput the cached dentries only if the filesystem mount is complete. i.e. cifs_sb->root is populated. Fixes: 5e9c89d43fa6 ("cifs: Grab a reference for the dentry of the cached directory during the lifetime of the cache") Signed-off-by: Shyam Prasad N Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/cifsfs.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index ed220daca3e1..92fd1a7e83dc 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -266,22 +266,24 @@ static void cifs_kill_sb(struct super_block *sb) * before we kill the sb. */ if (cifs_sb->root) { + node = rb_first(root); + while (node != NULL) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + tcon = tlink_tcon(tlink); + cfid = &tcon->crfid; + mutex_lock(&cfid->fid_mutex); + if (cfid->dentry) { + dput(cfid->dentry); + cfid->dentry = NULL; + } + mutex_unlock(&cfid->fid_mutex); + node = rb_next(node); + } + + /* finally release root dentry */ dput(cifs_sb->root); cifs_sb->root = NULL; } - node = rb_first(root); - while (node != NULL) { - tlink = rb_entry(node, struct tcon_link, tl_rbnode); - tcon = tlink_tcon(tlink); - cfid = &tcon->crfid; - mutex_lock(&cfid->fid_mutex); - if (cfid->dentry) { - dput(cfid->dentry); - cfid->dentry = NULL; - } - mutex_unlock(&cfid->fid_mutex); - node = rb_next(node); - } kill_anon_super(sb); cifs_umount(cifs_sb); -- cgit v1.2.3 From 22aa1597f462d2a917d1c6124cb33f520949936d Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 12 Apr 2022 09:30:39 -0700 Subject: io_uring: move io_uring_rsrc_update2 validation [ Upstream commit 565c5e616e8061b40a2e1d786c418a7ac3503a8d ] Move validation to be more consistently straight after copy_from_user. This is already done in io_register_rsrc_update and so this removes that redundant check. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220412163042.2788062-2-dylany@fb.com Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3d44d48b35ea..0568304a597a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -10595,8 +10595,6 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, __u32 tmp; int err; - if (up->resv) - return -EINVAL; if (check_add_overflow(up->offset, nr_args, &tmp)) return -EOVERFLOW; err = io_rsrc_node_switch_start(ctx); @@ -10622,6 +10620,8 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, memset(&up, 0, sizeof(up)); if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) return -EFAULT; + if (up.resv) + return -EINVAL; return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); } -- cgit v1.2.3 From 7a7c9f9de9616af339c0f59ccdcb43b996c0bf5c Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 12 Apr 2022 09:30:40 -0700 Subject: io_uring: verify that resv2 is 0 in io_uring_rsrc_update2 [ Upstream commit d8a3ba9c143bf89c032deced8a686ffa53b46098 ] Verify that the user does not pass in anything but 0 for this field. Fixes: 992da01aa932 ("io_uring: change registration/upd/rsrc tagging ABI") Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220412163042.2788062-3-dylany@fb.com Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/io_uring.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0568304a597a..66671c0bd864 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6403,6 +6403,7 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) up.nr = 0; up.tags = 0; up.resv = 0; + up.resv2 = 0; io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, @@ -10620,7 +10621,7 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, memset(&up, 0, sizeof(up)); if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) return -EFAULT; - if (up.resv) + if (up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); } @@ -10634,7 +10635,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; if (copy_from_user(&up, arg, sizeof(up))) return -EFAULT; - if (!up.nr || up.resv) + if (!up.nr || up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, type, &up, up.nr); } -- cgit v1.2.3 From 9947548d9cef58f6be28b7bb0be53138626ff319 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 12 Apr 2022 09:30:42 -0700 Subject: io_uring: verify pad field is 0 in io_get_ext_arg [ Upstream commit d2347b9695dafe5c388a5f9aeb70e27a7a4d29cf ] Ensure that only 0 is passed for pad here. Fixes: c73ebb685fb6 ("io_uring: add timeout support for io_uring_enter()") Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220412163042.2788062-5-dylany@fb.com Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 66671c0bd864..cc0a07a9fe9c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9981,6 +9981,8 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz return -EINVAL; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; + if (arg.pad) + return -EINVAL; *sig = u64_to_user_ptr(arg.sigmask); *argsz = arg.sigmask_sz; *ts = u64_to_user_ptr(arg.ts); -- cgit v1.2.3 From 9901b07ba42b39266b34a888e48d7306fd707bee Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Wed, 13 Apr 2022 04:42:51 -0700 Subject: cifs: potential buffer overflow in handling symlinks [ Upstream commit 64c4a37ac04eeb43c42d272f6e6c8c12bfcf4304 ] Smatch printed a warning: arch/x86/crypto/poly1305_glue.c:198 poly1305_update_arch() error: __memcpy() 'dctx->buf' too small (16 vs u32max) It's caused because Smatch marks 'link_len' as untrusted since it comes from sscanf(). Add a check to ensure that 'link_len' is not larger than the size of the 'link_str' buffer. Fixes: c69c1b6eaea1 ("cifs: implement CIFSParseMFSymlink()") Signed-off-by: Harshit Mogalapalli Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/link.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 852e54ee82c2..bbdf3281559c 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -85,6 +85,9 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len, if (rc != 1) return -EINVAL; + if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) + return -EINVAL; + rc = symlink_hash(link_len, link_str, md5_hash); if (rc) { cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); -- cgit v1.2.3 From bb93369f93b568eaa0c2fa56ef1979490372353d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 14 Mar 2022 10:55:32 -0700 Subject: btrfs: fix fallocate to use file_modified to update permissions consistently [ Upstream commit 05fd9564e9faf0f23b4676385e27d9405cef6637 ] Since the initial introduction of (posix) fallocate back at the turn of the century, it has been possible to use this syscall to change the user-visible contents of files. This can happen by extending the file size during a preallocation, or through any of the newer modes (punch, zero range). Because the call can be used to change file contents, we should treat it like we do any other modification to a file -- update the mtime, and drop set[ug]id privileges/capabilities. The VFS function file_modified() does all this for us if pass it a locked inode, so let's make fallocate drop permissions correctly. Reviewed-by: Filipe Manana Signed-off-by: Darrick J. Wong Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/file.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a1762363f61f..dc1e4d1b7291 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2878,8 +2878,9 @@ out: return ret; } -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; @@ -2911,6 +2912,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } + ret = file_modified(file); + if (ret) + goto out_only_mutex; + lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode))); lockend = round_down(offset + len, btrfs_inode_sectorsize(BTRFS_I(inode))) - 1; @@ -3351,7 +3356,7 @@ static long btrfs_fallocate(struct file *file, int mode, return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) - return btrfs_punch_hole(inode, offset, len); + return btrfs_punch_hole(file, offset, len); /* * Only trigger disk allocation, don't trigger qgroup reserve @@ -3373,6 +3378,10 @@ static long btrfs_fallocate(struct file *file, int mode, goto out; } + ret = file_modified(file); + if (ret) + goto out; + /* * TODO: Move these two operations after we have checked * accurate reserved space, or fallocate can still fail but -- cgit v1.2.3 From 01dcda701fefd8259b8a9518135730be08c2bad5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 23 Mar 2022 11:30:36 -0400 Subject: btrfs: do not warn for free space inode in cow_file_range [ Upstream commit a7d16d9a07bbcb7dcd5214a1bea75c808830bc0d ] This is a long time leftover from when I originally added the free space inode, the point was to catch cases where we weren't honoring the NOCOW flag. However there exists a race with relocation, if we allocate our free space inode in a block group that is about to be relocated, we could trigger the COW path before the relocation has the opportunity to find the extents and delete the free space cache. In production where we have auto-relocation enabled we're seeing this WARN_ON_ONCE() around 5k times in a 2 week period, so not super common but enough that it's at the top of our metrics. We're properly handling the error here, and with us phasing out v1 space cache anyway just drop the WARN_ON_ONCE. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7465ef012e22..6266a706bff7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1075,7 +1075,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, int ret = 0; if (btrfs_is_free_space_inode(inode)) { - WARN_ON_ONCE(1); ret = -EINVAL; goto out_unlock; } -- cgit v1.2.3 From 5218d5cc02833e9495bba401d6af7d7801fb22cc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Apr 2022 14:05:04 +0100 Subject: io_uring: zero tag on rsrc removal [ Upstream commit 8f0a24801bb44aa58496945aabb904c729176772 ] Automatically default rsrc tag in io_queue_rsrc_removal(), it's safer than leaving it there and relying on the rest of the code to behave and not use it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1cf262a50df17478ea25b22494dcc19f3a80301f.1649336342.git.asml.silence@gmail.com Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index cc0a07a9fe9c..ca207e9a87cd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8413,13 +8413,15 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc) { + u64 *tag_slot = io_get_tag_slot(data, idx); struct io_rsrc_put *prsrc; prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); if (!prsrc) return -ENOMEM; - prsrc->tag = *io_get_tag_slot(data, idx); + prsrc->tag = *tag_slot; + *tag_slot = 0; prsrc->rsrc = rsrc; list_add(&prsrc->list, &node->rsrc_list); return 0; -- cgit v1.2.3 From b8ed0f7531f30c7ab5ee5e83b704fecb86acf3dc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Apr 2022 14:05:05 +0100 Subject: io_uring: use nospec annotation for more indexes [ Upstream commit 4cdd158be9d09223737df83136a1fb65269d809a ] There are still several places that using pre array_index_nospec() indexes, fix them up. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b01ef5ee83f72ed35ad525912370b729f5d145f4.1649336342.git.asml.silence@gmail.com Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/io_uring.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index ca207e9a87cd..1bf1ea2cd8b0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8489,7 +8489,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; struct io_fixed_file *file_slot; struct file *file; - int ret, i; + int ret; io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); ret = -ENXIO; @@ -8502,8 +8502,8 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) if (ret) goto out; - i = array_index_nospec(offset, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, i); + offset = array_index_nospec(offset, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, offset); ret = -EBADF; if (!file_slot->file_ptr) goto out; @@ -8559,8 +8559,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (file_slot->file_ptr) { file = (struct file *)(file_slot->file_ptr & FFS_MASK); - err = io_queue_rsrc_removal(data, up->offset + done, - ctx->rsrc_node, file); + err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file); if (err) break; file_slot->file_ptr = 0; @@ -9229,7 +9228,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, i = array_index_nospec(offset, ctx->nr_user_bufs); if (ctx->user_bufs[i] != ctx->dummy_ubuf) { - err = io_queue_rsrc_removal(ctx->buf_data, offset, + err = io_queue_rsrc_removal(ctx->buf_data, i, ctx->rsrc_node, ctx->user_bufs[i]); if (unlikely(err)) { io_buffer_unmap(ctx, &imu); -- cgit v1.2.3 From 7a509a9f2bb7b7edce6e26242fceaf0d212369f9 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 13 Apr 2022 10:02:17 +1000 Subject: cifs: verify that tcon is valid before dereference in cifs_kill_sb commit 8b6c58458ee3206dde345fce327a4cb83e69caf9 upstream. On umount, cifs_sb->tlink_tree might contain entries that do not represent a valid tcon. Check the tcon for error before we dereference it. Signed-off-by: Ronnie Sahlberg Cc: stable@vger.kernel.org Reviewed-by: Shyam Prasad N Reported-by: Xiaoli Feng Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 92fd1a7e83dc..29a019cf1d5f 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -266,10 +266,11 @@ static void cifs_kill_sb(struct super_block *sb) * before we kill the sb. */ if (cifs_sb->root) { - node = rb_first(root); - while (node != NULL) { + for (node = rb_first(root); node; node = rb_next(node)) { tlink = rb_entry(node, struct tcon_link, tl_rbnode); tcon = tlink_tcon(tlink); + if (IS_ERR(tcon)) + continue; cfid = &tcon->crfid; mutex_lock(&cfid->fid_mutex); if (cfid->dentry) { @@ -277,7 +278,6 @@ static void cifs_kill_sb(struct super_block *sb) cfid->dentry = NULL; } mutex_unlock(&cfid->fid_mutex); - node = rb_next(node); } /* finally release root dentry */ -- cgit v1.2.3 From 252db93fd0bd5ca07c9b933ed94e93a4a43e8901 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Thu, 24 Mar 2022 06:44:54 -0700 Subject: btrfs: fix root ref counts in error handling in btrfs_get_root_ref commit 168a2f776b9762f4021421008512dd7ab7474df1 upstream. In btrfs_get_root_ref(), when btrfs_insert_fs_root() fails, btrfs_put_root() can happen for two reasons: - the root already exists in the tree, in that case it returns the reference obtained in btrfs_lookup_fs_root() - another error so the cleanup is done in the fail label Calling btrfs_put_root() unconditionally would lead to double decrement of the root reference possibly freeing it in the second case. Reported-by: TOTE Robot Fixes: bc44d7c4b2b1 ("btrfs: push btrfs_grab_fs_root into btrfs_get_fs_root") CC: stable@vger.kernel.org # 5.10+ Signed-off-by: Jia-Ju Bai Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/disk-io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index aed1f26ca2b8..f1f7dbfa6ecd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1738,9 +1738,10 @@ again: ret = btrfs_insert_fs_root(fs_info, root); if (ret) { - btrfs_put_root(root); - if (ret == -EEXIST) + if (ret == -EEXIST) { + btrfs_put_root(root); goto again; + } goto fail; } return root; -- cgit v1.2.3 From 3680b48533aec77fd21875c0c11398e1c2774f1e Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 29 Mar 2022 15:55:58 +0900 Subject: btrfs: mark resumed async balance as writing commit a690e5f2db4d1dca742ce734aaff9f3112d63764 upstream. When btrfs balance is interrupted with umount, the background balance resumes on the next mount. There is a potential deadlock with FS freezing here like as described in commit 26559780b953 ("btrfs: zoned: mark relocation as writing"). Mark the process as sb_writing to avoid it. Reviewed-by: Filipe Manana CC: stable@vger.kernel.org # 4.9+ Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/volumes.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f6d6825f218a..471cc4706a07 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4389,10 +4389,12 @@ static int balance_kthread(void *data) struct btrfs_fs_info *fs_info = data; int ret = 0; + sb_start_write(fs_info->sb); mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); mutex_unlock(&fs_info->balance_mutex); + sb_end_write(fs_info->sb); return ret; } -- cgit v1.2.3 From 4b7617ae04de31fe96aae445a35395078b6eefd6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Oct 2021 08:25:24 +0200 Subject: fs: remove __sync_filesystem [ Upstream commit 9a208ba5c9afa62c7b1e9c6f5e783066e84e2d3c ] There is no clear benefit in having this helper vs just open coding it. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20211019062530.2174626-2-hch@lst.de Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/sync.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/sync.c b/fs/sync.c index 1373a610dc78..0d6cdc507cb9 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -21,25 +21,6 @@ #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ SYNC_FILE_RANGE_WAIT_AFTER) -/* - * Do the filesystem syncing work. For simple filesystems - * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to - * submit IO for these buffers via __sync_blockdev(). This also speeds up the - * wait == 1 case since in that case write_inode() functions do - * sync_dirty_buffer() and thus effectively write one block at a time. - */ -static int __sync_filesystem(struct super_block *sb, int wait) -{ - if (wait) - sync_inodes_sb(sb); - else - writeback_inodes_sb(sb, WB_REASON_SYNC); - - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, wait); - return __sync_blockdev(sb->s_bdev, wait); -} - /* * Write out and wait upon all dirty data associated with this * superblock. Filesystem data as well as the underlying block @@ -61,10 +42,25 @@ int sync_filesystem(struct super_block *sb) if (sb_rdonly(sb)) return 0; - ret = __sync_filesystem(sb, 0); + /* + * Do the filesystem syncing work. For simple filesystems + * writeback_inodes_sb(sb) just dirties buffers with inodes so we have + * to submit I/O for these buffers via __sync_blockdev(). This also + * speeds up the wait == 1 case since in that case write_inode() + * methods call sync_dirty_buffer() and thus effectively write one block + * at a time. + */ + writeback_inodes_sb(sb, WB_REASON_SYNC); + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 0); + ret = __sync_blockdev(sb->s_bdev, 0); if (ret < 0) return ret; - return __sync_filesystem(sb, 1); + + sync_inodes_sb(sb); + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + return __sync_blockdev(sb->s_bdev, 1); } EXPORT_SYMBOL(sync_filesystem); -- cgit v1.2.3 From 7877e7a5a52e1c9716326b94dfe6c7e6cd7bce5a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Oct 2021 08:25:25 +0200 Subject: block: remove __sync_blockdev [ Upstream commit 70164eb6ccb76ab679b016b4b60123bf4ec6c162 ] Instead offer a new sync_blockdev_nowait helper for the !wait case. This new helper is exported as it will grow modular callers in a bit. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211019062530.2174626-3-hch@lst.de Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- block/bdev.c | 11 ++++++----- fs/internal.h | 5 ----- fs/sync.c | 7 ++++--- include/linux/blkdev.h | 5 +++++ 4 files changed, 15 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/block/bdev.c b/block/bdev.c index 485a258b0ab3..33cac289302e 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -184,14 +184,13 @@ int sb_min_blocksize(struct super_block *sb, int size) EXPORT_SYMBOL(sb_min_blocksize); -int __sync_blockdev(struct block_device *bdev, int wait) +int sync_blockdev_nowait(struct block_device *bdev) { if (!bdev) return 0; - if (!wait) - return filemap_flush(bdev->bd_inode->i_mapping); - return filemap_write_and_wait(bdev->bd_inode->i_mapping); + return filemap_flush(bdev->bd_inode->i_mapping); } +EXPORT_SYMBOL_GPL(sync_blockdev_nowait); /* * Write out and wait upon all the dirty data associated with a block @@ -199,7 +198,9 @@ int __sync_blockdev(struct block_device *bdev, int wait) */ int sync_blockdev(struct block_device *bdev) { - return __sync_blockdev(bdev, 1); + if (!bdev) + return 0; + return filemap_write_and_wait(bdev->bd_inode->i_mapping); } EXPORT_SYMBOL(sync_blockdev); diff --git a/fs/internal.h b/fs/internal.h index 3cd065c8a66b..b5caa16f4645 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -23,7 +23,6 @@ struct pipe_inode_info; #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); -extern int __sync_blockdev(struct block_device *bdev, int wait); void iterate_bdevs(void (*)(struct block_device *, void *), void *); void emergency_thaw_bdev(struct super_block *sb); #else @@ -31,10 +30,6 @@ static inline void bdev_cache_init(void) { } -static inline int __sync_blockdev(struct block_device *bdev, int wait) -{ - return 0; -} static inline void iterate_bdevs(void (*f)(struct block_device *, void *), void *arg) { diff --git a/fs/sync.c b/fs/sync.c index 0d6cdc507cb9..a621089eb07e 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -3,6 +3,7 @@ * High-level sync()-related operations */ +#include #include #include #include @@ -45,7 +46,7 @@ int sync_filesystem(struct super_block *sb) /* * Do the filesystem syncing work. For simple filesystems * writeback_inodes_sb(sb) just dirties buffers with inodes so we have - * to submit I/O for these buffers via __sync_blockdev(). This also + * to submit I/O for these buffers via sync_blockdev(). This also * speeds up the wait == 1 case since in that case write_inode() * methods call sync_dirty_buffer() and thus effectively write one block * at a time. @@ -53,14 +54,14 @@ int sync_filesystem(struct super_block *sb) writeback_inodes_sb(sb, WB_REASON_SYNC); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, 0); - ret = __sync_blockdev(sb->s_bdev, 0); + ret = sync_blockdev_nowait(sb->s_bdev); if (ret < 0) return ret; sync_inodes_sb(sb); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, 1); - return __sync_blockdev(sb->s_bdev, 1); + return sync_blockdev(sb->s_bdev); } EXPORT_SYMBOL(sync_filesystem); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 413c0148c0ce..6bbd393e6bcc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1999,6 +1999,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); +int sync_blockdev_nowait(struct block_device *bdev); #else static inline void invalidate_bdev(struct block_device *bdev) { @@ -2007,6 +2008,10 @@ static inline int sync_blockdev(struct block_device *bdev) { return 0; } +static inline int sync_blockdev_nowait(struct block_device *bdev) +{ + return 0; +} #endif int fsync_bdev(struct block_device *bdev); -- cgit v1.2.3 From 6eb927ee189f39746bcb02123d270ef04457eab6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Oct 2021 08:25:30 +0200 Subject: block: simplify the block device syncing code [ Upstream commit 1e03a36bdff4709c1bbf0f57f60ae3f776d51adf ] Get rid of the indirections and just provide a sync_bdevs helper for the generic sync code. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211019062530.2174626-8-hch@lst.de Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- block/bdev.c | 17 ++++++++++++++--- fs/internal.h | 6 ------ fs/sync.c | 23 ++++------------------- include/linux/blkdev.h | 4 ++++ 4 files changed, 22 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/block/bdev.c b/block/bdev.c index 33cac289302e..18abafb135e0 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -1017,7 +1017,7 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty) } EXPORT_SYMBOL(__invalidate_device); -void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) +void sync_bdevs(bool wait) { struct inode *inode, *old_inode = NULL; @@ -1048,8 +1048,19 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) bdev = I_BDEV(inode); mutex_lock(&bdev->bd_disk->open_mutex); - if (bdev->bd_openers) - func(bdev, arg); + if (!bdev->bd_openers) { + ; /* skip */ + } else if (wait) { + /* + * We keep the error status of individual mapping so + * that applications can catch the writeback error using + * fsync(2). See filemap_fdatawait_keep_errors() for + * details. + */ + filemap_fdatawait_keep_errors(inode->i_mapping); + } else { + filemap_fdatawrite(inode->i_mapping); + } mutex_unlock(&bdev->bd_disk->open_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); diff --git a/fs/internal.h b/fs/internal.h index b5caa16f4645..cdd83d4899bb 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -23,17 +23,11 @@ struct pipe_inode_info; #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); -void iterate_bdevs(void (*)(struct block_device *, void *), void *); void emergency_thaw_bdev(struct super_block *sb); #else static inline void bdev_cache_init(void) { } - -static inline void iterate_bdevs(void (*f)(struct block_device *, void *), - void *arg) -{ -} static inline int emergency_thaw_bdev(struct super_block *sb) { return 0; diff --git a/fs/sync.c b/fs/sync.c index a621089eb07e..3ce8e2137f31 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -78,21 +78,6 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg) sb->s_op->sync_fs(sb, *(int *)arg); } -static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) -{ - filemap_fdatawrite(bdev->bd_inode->i_mapping); -} - -static void fdatawait_one_bdev(struct block_device *bdev, void *arg) -{ - /* - * We keep the error status of individual mapping so that - * applications can catch the writeback error using fsync(2). - * See filemap_fdatawait_keep_errors() for details. - */ - filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping); -} - /* * Sync everything. We start by waking flusher threads so that most of * writeback runs on all devices in parallel. Then we sync all inodes reliably @@ -111,8 +96,8 @@ void ksys_sync(void) iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &wait); - iterate_bdevs(fdatawrite_one_bdev, NULL); - iterate_bdevs(fdatawait_one_bdev, NULL); + sync_bdevs(false); + sync_bdevs(true); if (unlikely(laptop_mode)) laptop_sync_completion(); } @@ -133,10 +118,10 @@ static void do_sync_work(struct work_struct *work) */ iterate_supers(sync_inodes_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &nowait); - iterate_bdevs(fdatawrite_one_bdev, NULL); + sync_bdevs(false); iterate_supers(sync_inodes_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &nowait); - iterate_bdevs(fdatawrite_one_bdev, NULL); + sync_bdevs(false); printk("Emergency Sync complete\n"); kfree(work); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6bbd393e6bcc..aebe67ed7a73 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -2000,6 +2000,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); int sync_blockdev_nowait(struct block_device *bdev); +void sync_bdevs(bool wait); #else static inline void invalidate_bdev(struct block_device *bdev) { @@ -2012,6 +2013,9 @@ static inline int sync_blockdev_nowait(struct block_device *bdev) { return 0; } +static inline void sync_bdevs(bool wait) +{ +} #endif int fsync_bdev(struct block_device *bdev); -- cgit v1.2.3 From 935745abcf4c695a18b9af3fbe295e322547a114 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 30 Jan 2022 08:53:16 -0800 Subject: vfs: make sync_filesystem return errors from ->sync_fs [ Upstream commit 5679897eb104cec9e99609c3f045a0c20603da4c ] Strangely, sync_filesystem ignores the return code from the ->sync_fs call, which means that syscalls like syncfs(2) never see the error. This doesn't seem right, so fix that. Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner Signed-off-by: Sasha Levin --- fs/sync.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/sync.c b/fs/sync.c index 3ce8e2137f31..c7690016453e 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -29,7 +29,7 @@ */ int sync_filesystem(struct super_block *sb) { - int ret; + int ret = 0; /* * We need to be protected against the filesystem going from @@ -52,15 +52,21 @@ int sync_filesystem(struct super_block *sb) * at a time. */ writeback_inodes_sb(sb, WB_REASON_SYNC); - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 0); + if (sb->s_op->sync_fs) { + ret = sb->s_op->sync_fs(sb, 0); + if (ret) + return ret; + } ret = sync_blockdev_nowait(sb->s_bdev); - if (ret < 0) + if (ret) return ret; sync_inodes_sb(sb); - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 1); + if (sb->s_op->sync_fs) { + ret = sb->s_op->sync_fs(sb, 1); + if (ret) + return ret; + } return sync_blockdev(sb->s_bdev); } EXPORT_SYMBOL(sync_filesystem); -- cgit v1.2.3 From 155ae0547cb8c8b7b5fa96891dbaba0c664b023b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 30 Jan 2022 08:53:17 -0800 Subject: xfs: return errors in xfs_fs_sync_fs [ Upstream commit 2d86293c70750e4331e9616aded33ab6b47c299d ] Now that the VFS will do something with the return values from ->sync_fs, make ours pass on error codes. Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner Signed-off-by: Sasha Levin --- fs/xfs/xfs_super.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c4e0cd1c1c8c..170fee98c45c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -729,6 +729,7 @@ xfs_fs_sync_fs( int wait) { struct xfs_mount *mp = XFS_M(sb); + int error; trace_xfs_fs_sync_fs(mp, __return_address); @@ -738,7 +739,10 @@ xfs_fs_sync_fs( if (!wait) return 0; - xfs_log_force(mp, XFS_LOG_SYNC); + error = xfs_log_force(mp, XFS_LOG_SYNC); + if (error) + return error; + if (laptop_mode) { /* * The disk must be active because we're syncing. -- cgit v1.2.3 From c2d0cdf8ad06ec1d1687729e5c5ccdd7746343e8 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 17 Jan 2022 10:25:07 -0500 Subject: gfs2: assign rgrp glock before compute_bitstructs commit 428f651cb80b227af47fc302e4931791f2fb4741 upstream. Before this patch, function read_rindex_entry called compute_bitstructs before it allocated a glock for the rgrp. But if compute_bitstructs found a problem with the rgrp, it called gfs2_consist_rgrpd, and that called gfs2_dump_glock for rgd->rd_gl which had not yet been assigned. read_rindex_entry compute_bitstructs gfs2_consist_rgrpd gfs2_dump_glock <---------rgd->rd_gl was not set. This patch changes read_rindex_entry so it assigns an rgrp glock before calling compute_bitstructs so gfs2_dump_glock does not reference an unassigned pointer. If an error is discovered, the glock must also be put, so a new goto and label were added. Reported-by: syzbot+c6fd14145e2f62ca0784@syzkaller.appspotmail.com Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/rgrp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 403cf6f1eb8c..6901cd85f1df 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -923,15 +923,15 @@ static int read_rindex_entry(struct gfs2_inode *ip) spin_lock_init(&rgd->rd_rsspin); mutex_init(&rgd->rd_mutex); - error = compute_bitstructs(rgd); - if (error) - goto fail; - error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); if (error) goto fail; + error = compute_bitstructs(rgd); + if (error) + goto fail_glock; + rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED); if (rgd->rd_data > sdp->sd_max_rg_data) @@ -945,6 +945,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) } error = 0; /* someone else read in the rgrp; free it and ignore it */ +fail_glock: gfs2_glock_put(rgd->rd_gl); fail: -- cgit v1.2.3 From 213330bafd021eaa378d4aa91d40ba2736abd4de Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 7 Apr 2022 00:03:14 +0100 Subject: cifs: Check the IOCB_DIRECT flag, not O_DIRECT [ Upstream commit 994fd530a512597ffcd713b0f6d5bc916c5698f0 ] Use the IOCB_DIRECT indicator flag on the I/O context rather than checking to see if the file was opened O_DIRECT. Signed-off-by: David Howells cc: Steve French cc: Shyam Prasad N cc: Rohith Surabattula cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 29a019cf1d5f..8f8d281e3151 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -936,7 +936,7 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t rc; struct inode *inode = file_inode(iocb->ki_filp); - if (iocb->ki_filp->f_flags & O_DIRECT) + if (iocb->ki_flags & IOCB_DIRECT) return cifs_user_readv(iocb, iter); rc = cifs_revalidate_mapping(inode); -- cgit v1.2.3 From 773ca67ffc964785fae7e950e164ed845c9b365d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 12 Apr 2022 05:41:00 -0400 Subject: stat: fix inconsistency between struct stat and struct compat_stat [ Upstream commit 932aba1e169090357a77af18850a10c256b50819 ] struct stat (defined in arch/x86/include/uapi/asm/stat.h) has 32-bit st_dev and st_rdev; struct compat_stat (defined in arch/x86/include/asm/compat.h) has 16-bit st_dev and st_rdev followed by a 16-bit padding. This patch fixes struct compat_stat to match struct stat. [ Historical note: the old x86 'struct stat' did have that 16-bit field that the compat layer had kept around, but it was changes back in 2003 by "struct stat - support larger dev_t": https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/?id=e95b2065677fe32512a597a79db94b77b90c968d and back in those days, the x86_64 port was still new, and separate from the i386 code, and had already picked up the old version with a 16-bit st_dev field ] Note that we can't change compat_dev_t because it is used by compat_loop_info. Also, if the st_dev and st_rdev values are 32-bit, we don't have to use old_valid_dev to test if the value fits into them. This fixes -EOVERFLOW on filesystems that are on NVMe because NVMe uses the major number 259. Signed-off-by: Mikulas Patocka Cc: Andreas Schwab Cc: Matthew Wilcox Cc: Christoph Hellwig Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- arch/x86/include/asm/compat.h | 6 ++---- fs/stat.c | 19 ++++++++++--------- 2 files changed, 12 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 7516e4199b3c..20fd0acd7d80 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -28,15 +28,13 @@ typedef u16 compat_ipc_pid_t; typedef __kernel_fsid_t compat_fsid_t; struct compat_stat { - compat_dev_t st_dev; - u16 __pad1; + u32 st_dev; compat_ino_t st_ino; compat_mode_t st_mode; compat_nlink_t st_nlink; __compat_uid_t st_uid; __compat_gid_t st_gid; - compat_dev_t st_rdev; - u16 __pad2; + u32 st_rdev; u32 st_size; u32 st_blksize; u32 st_blocks; diff --git a/fs/stat.c b/fs/stat.c index 28d2020ba1f4..246d138ec066 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -334,9 +334,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat # define choose_32_64(a,b) b #endif -#define valid_dev(x) choose_32_64(old_valid_dev(x),true) -#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) - #ifndef INIT_STRUCT_STAT_PADDING # define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) #endif @@ -345,7 +342,9 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) { struct stat tmp; - if (!valid_dev(stat->dev) || !valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) @@ -353,7 +352,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) #endif INIT_STRUCT_STAT_PADDING(tmp); - tmp.st_dev = encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -363,7 +362,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_mtime = stat->mtime.tv_sec; @@ -644,11 +643,13 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { struct compat_stat tmp; - if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; memset(&tmp, 0, sizeof(tmp)); - tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -658,7 +659,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = old_encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); if ((u64) stat->size > MAX_NON_LFS) return -EOVERFLOW; tmp.st_size = stat->size; -- cgit v1.2.3 From 4a9f9f1791f331ca53ac89f3d5e6e2c9b482d66c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 14 Apr 2022 13:57:35 +1000 Subject: VFS: filename_create(): fix incorrect intent. [ Upstream commit b3d4650d82c71b9c9a8184de9e8bb656012b289e ] When asked to create a path ending '/', but which is not to be a directory (LOOKUP_DIRECTORY not set), filename_create() will never try to create the file. If it doesn't exist, -ENOENT is reported. However, it still passes LOOKUP_CREATE|LOOKUP_EXCL to the filesystems ->lookup() function, even though there is no intent to create. This is misleading and can cause incorrect behaviour. If you try ln -s foo /path/dir/ where 'dir' is a directory on an NFS filesystem which is not currently known in the dcache, this will fail with ENOENT. But as the name is not in the dcache, nfs_lookup gets called with LOOKUP_CREATE|LOOKUP_EXCL and so it returns NULL without performing any lookup, with the expectation that a subsequent call to create the target will be made, and the lookup can be combined with the creation. In the case with a trailing '/' and no LOOKUP_DIRECTORY, that call is never made. Instead filename_create() sees that the dentry is not (yet) positive and returns -ENOENT - even though the directory actually exists. So only set LOOKUP_CREATE|LOOKUP_EXCL if there really is an intent to create, and use the absence of these flags to decide if -ENOENT should be returned. Note that filename_parentat() is only interested in LOOKUP_REVAL, so we split that out and store it in 'reval_flag'. __lookup_hash() then gets reval_flag combined with whatever create flags were determined to be needed. Reviewed-by: David Disseldorp Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Cc: Al Viro Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/namei.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 3bb65f48fe1d..8882a70dc119 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3625,18 +3625,14 @@ static struct dentry *filename_create(int dfd, struct filename *name, { struct dentry *dentry = ERR_PTR(-EEXIST); struct qstr last; + bool want_dir = lookup_flags & LOOKUP_DIRECTORY; + unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; + unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; int type; int err2; int error; - bool is_dir = (lookup_flags & LOOKUP_DIRECTORY); - /* - * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any - * other flags passed in are ignored! - */ - lookup_flags &= LOOKUP_REVAL; - - error = filename_parentat(dfd, name, lookup_flags, path, &last, &type); + error = filename_parentat(dfd, name, reval_flag, path, &last, &type); if (error) return ERR_PTR(error); @@ -3650,11 +3646,13 @@ static struct dentry *filename_create(int dfd, struct filename *name, /* don't fail immediately if it's r/o, at least try to report other errors */ err2 = mnt_want_write(path->mnt); /* - * Do the final lookup. + * Do the final lookup. Suppress 'create' if there is a trailing + * '/', and a directory wasn't requested. */ - lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL; + if (last.name[last.len] && !want_dir) + create_flags = 0; inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - dentry = __lookup_hash(&last, path->dentry, lookup_flags); + dentry = __lookup_hash(&last, path->dentry, reval_flag | create_flags); if (IS_ERR(dentry)) goto unlock; @@ -3668,7 +3666,7 @@ static struct dentry *filename_create(int dfd, struct filename *name, * all is fine. Let's be bastards - you had / on the end, you've * been asking for (non-existent) directory. -ENOENT for you. */ - if (unlikely(!is_dir && last.name[last.len])) { + if (unlikely(!create_flags)) { error = -ENOENT; goto fail; } -- cgit v1.2.3 From 9dcb65cdf3128113ce9a48b05a72b6f8ef2bc257 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Apr 2022 16:35:46 -0700 Subject: mm, hugetlb: allow for "high" userspace addresses commit 5f24d5a579d1eace79d505b148808a850b417d4c upstream. This is a fix for commit f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses") for hugetlb. This patch adds support for "high" userspace addresses that are optionally supported on the system and have to be requested via a hint mechanism ("high" addr parameter to mmap). Architectures such as powerpc and x86 achieve this by making changes to their architectural versions of hugetlb_get_unmapped_area() function. However, arm64 uses the generic version of that function. So take into account arch_get_mmap_base() and arch_get_mmap_end() in hugetlb_get_unmapped_area(). To allow that, move those two macros out of mm/mmap.c into include/linux/sched/mm.h If these macros are not defined in architectural code then they default to (TASK_SIZE) and (base) so should not introduce any behavioural changes to architectures that do not define them. For the time being, only ARM64 is affected by this change. Catalin (ARM64) said "We should have fixed hugetlb_get_unmapped_area() as well when we added support for 52-bit VA. The reason for commit f6795053dac8 was to prevent normal mmap() from returning addresses above 48-bit by default as some user-space had hard assumptions about this. It's a slight ABI change if you do this for hugetlb_get_unmapped_area() but I doubt anyone would notice. It's more likely that the current behaviour would cause issues, so I'd rather have them consistent. Basically when arm64 gained support for 52-bit addresses we did not want user-space calling mmap() to suddenly get such high addresses, otherwise we could have inadvertently broken some programs (similar behaviour to x86 here). Hence we added commit f6795053dac8. But we missed hugetlbfs which could still get such high mmap() addresses. So in theory that's a potential regression that should have bee addressed at the same time as commit f6795053dac8 (and before arm64 enabled 52-bit addresses)" Link: https://lkml.kernel.org/r/ab847b6edb197bffdfe189e70fb4ac76bfe79e0d.1650033747.git.christophe.leroy@csgroup.eu Fixes: f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses") Signed-off-by: Christophe Leroy Reviewed-by: Catalin Marinas Cc: Steve Capper Cc: Will Deacon Cc: [5.0.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/hugetlbfs/inode.c | 9 +++++---- include/linux/sched/mm.h | 8 ++++++++ mm/mmap.c | 8 -------- 3 files changed, 13 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 54c4e0b0dda4..bb0651a4a128 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); @@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = current->mm->mmap_base; + info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); @@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); addr = vm_unmapped_area(&info); } @@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); + const unsigned long mmap_end = arch_get_mmap_end(addr); if (len & ~huge_page_mask(h)) return -EINVAL; @@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (mmap_end - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 5561486fddef..95fb7aaaec8d 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -106,6 +106,14 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU +#ifndef arch_get_mmap_end +#define arch_get_mmap_end(addr) (TASK_SIZE) +#endif + +#ifndef arch_get_mmap_base +#define arch_get_mmap_base(addr, base) (base) +#endif + extern void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack); extern unsigned long diff --git a/mm/mmap.c b/mm/mmap.c index 049b8e5c18f0..6bb553ed5c55 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2113,14 +2113,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) return addr; } -#ifndef arch_get_mmap_end -#define arch_get_mmap_end(addr) (TASK_SIZE) -#endif - -#ifndef arch_get_mmap_base -#define arch_get_mmap_base(addr, base) (base) -#endif - /* Get an address range which is currently unmapped. * For shmat() with addr=0. * -- cgit v1.2.3 From ba50ea456f49f401ddea107ca9bae5d4d2ec0234 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Mar 2022 10:50:43 -0800 Subject: ext4: fix fallocate to use file_modified to update permissions consistently commit ad5cd4f4ee4d5fcdb1bfb7a0c073072961e70783 upstream. Since the initial introduction of (posix) fallocate back at the turn of the century, it has been possible to use this syscall to change the user-visible contents of files. This can happen by extending the file size during a preallocation, or through any of the newer modes (punch, zero, collapse, insert range). Because the call can be used to change file contents, we should treat it like we do any other modification to a file -- update the mtime, and drop set[ug]id privileges/capabilities. The VFS function file_modified() does all this for us if pass it a locked inode, so let's make fallocate drop permissions correctly. Signed-off-by: Darrick J. Wong Link: https://lore.kernel.org/r/20220308185043.GA117678@magnolia Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 32 +++++++++++++++++++++++++------- fs/ext4/inode.c | 7 ++++++- 3 files changed, 32 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index db981619f6c8..016d8e4ce0d0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3027,7 +3027,7 @@ extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); -extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); +extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b81c008e6675..44d00951e609 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4504,9 +4504,9 @@ retry: return ret > 0 ? ret2 : ret; } -static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); -static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); +static int ext4_insert_range(struct file *file, loff_t offset, loff_t len); static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) @@ -4578,6 +4578,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, @@ -4696,7 +4700,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) ext4_fc_start_update(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { - ret = ext4_punch_hole(inode, offset, len); + ret = ext4_punch_hole(file, offset, len); goto exit; } @@ -4705,12 +4709,12 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto exit; if (mode & FALLOC_FL_COLLAPSE_RANGE) { - ret = ext4_collapse_range(inode, offset, len); + ret = ext4_collapse_range(file, offset, len); goto exit; } if (mode & FALLOC_FL_INSERT_RANGE) { - ret = ext4_insert_range(inode, offset, len); + ret = ext4_insert_range(file, offset, len); goto exit; } @@ -4746,6 +4750,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out; + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); if (ret) goto out; @@ -5248,8 +5256,9 @@ out: * This implements the fallocate's collapse range functionality for ext4 * Returns: 0 and non-zero on error. */ -static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; ext4_lblk_t punch_start, punch_stop; @@ -5301,6 +5310,10 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) /* Wait for existing dio to complete */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. @@ -5394,8 +5407,9 @@ out_mutex: * by len bytes. * Returns 0 on success, error otherwise. */ -static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; handle_t *handle; @@ -5452,6 +5466,10 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) /* Wait for existing dio to complete */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fff52292c01e..fbb590d95095 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3939,8 +3939,9 @@ int ext4_break_layouts(struct inode *inode) * Returns: 0 on success or negative on failure */ -int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) +int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; @@ -4002,6 +4003,10 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. -- cgit v1.2.3 From 8bb5676b49d3a44c4aed121dd3378f4d95da8e55 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 21 Mar 2022 22:44:38 +0800 Subject: ext4: fix symlink file size not match to file content commit a2b0b205d125f27cddfb4f7280e39affdaf46686 upstream. We got issue as follows: [home]# fsck.ext4 -fn ram0yb e2fsck 1.45.6 (20-Mar-2020) Pass 1: Checking inodes, blocks, and sizes Pass 2: Checking directory structure Symlink /p3/d14/d1a/l3d (inode #3494) is invalid. Clear? no Entry 'l3d' in /p3/d14/d1a (3383) has an incorrect filetype (was 7, should be 0). Fix? no As the symlink file size does not match the file content. If the writeback of the symlink data block failed, ext4_finish_bio() handles the end of IO. However this function fails to mark the buffer with BH_write_io_error and so when unmount does journal checkpoint it cannot detect the writeback error and will cleanup the journal. Thus we've lost the correct data in the journal area. To solve this issue, mark the buffer as BH_write_io_error in ext4_finish_bio(). Cc: stable@kernel.org Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220321144438.201685-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/page-io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index f038d578d8d8..18977ff8e493 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -134,8 +134,10 @@ static void ext4_finish_bio(struct bio *bio) continue; } clear_buffer_async_write(bh); - if (bio->bi_status) + if (bio->bi_status) { + set_buffer_write_io_error(bh); buffer_io_error(bh); + } } while ((bh = bh->b_this_page) != head); spin_unlock_irqrestore(&head->b_uptodate_lock, flags); if (!under_io) { -- cgit v1.2.3 From e3912775b4766a81cc80279ccb3740d514926ad0 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 24 Mar 2022 14:48:16 +0800 Subject: ext4: fix use-after-free in ext4_search_dir commit c186f0887fe7061a35cebef024550ec33ef8fbd8 upstream. We got issue as follows: EXT4-fs (loop0): mounted filesystem without journal. Opts: ,errors=continue ================================================================== BUG: KASAN: use-after-free in ext4_search_dir fs/ext4/namei.c:1394 [inline] BUG: KASAN: use-after-free in search_dirblock fs/ext4/namei.c:1199 [inline] BUG: KASAN: use-after-free in __ext4_find_entry+0xdca/0x1210 fs/ext4/namei.c:1553 Read of size 1 at addr ffff8881317c3005 by task syz-executor117/2331 CPU: 1 PID: 2331 Comm: syz-executor117 Not tainted 5.10.0+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:83 [inline] dump_stack+0x144/0x187 lib/dump_stack.c:124 print_address_description+0x7d/0x630 mm/kasan/report.c:387 __kasan_report+0x132/0x190 mm/kasan/report.c:547 kasan_report+0x47/0x60 mm/kasan/report.c:564 ext4_search_dir fs/ext4/namei.c:1394 [inline] search_dirblock fs/ext4/namei.c:1199 [inline] __ext4_find_entry+0xdca/0x1210 fs/ext4/namei.c:1553 ext4_lookup_entry fs/ext4/namei.c:1622 [inline] ext4_lookup+0xb8/0x3a0 fs/ext4/namei.c:1690 __lookup_hash+0xc5/0x190 fs/namei.c:1451 do_rmdir+0x19e/0x310 fs/namei.c:3760 do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x445e59 Code: 4d c7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 1b c7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fff2277fac8 EFLAGS: 00000246 ORIG_RAX: 0000000000000054 RAX: ffffffffffffffda RBX: 0000000000400280 RCX: 0000000000445e59 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000200000c0 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000002 R10: 00007fff2277f990 R11: 0000000000000246 R12: 0000000000000000 R13: 431bde82d7b634db R14: 0000000000000000 R15: 0000000000000000 The buggy address belongs to the page: page:0000000048cd3304 refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x1317c3 flags: 0x200000000000000() raw: 0200000000000000 ffffea0004526588 ffffea0004528088 0000000000000000 raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8881317c2f00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff8881317c2f80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >ffff8881317c3000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff8881317c3080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff8881317c3100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ================================================================== ext4_search_dir: ... de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit) { ... if ((char *) de + de->name_len <= dlimit && ext4_match(dir, fname, de)) { ... } ... de_len = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); if (de_len <= 0) return -1; offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } Assume: de=0xffff8881317c2fff dlimit=0x0xffff8881317c3000 If read 'de->name_len' which address is 0xffff8881317c3005, obviously is out of range, then will trigger use-after-free. To solve this issue, 'dlimit' must reserve 8 bytes, as we will read 'de->name_len' to judge if '(char *) de + de->name_len' out of range. Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220324064816.1209985-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4.h | 4 ++++ fs/ext4/namei.c | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 016d8e4ce0d0..a0a987857894 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2267,6 +2267,10 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) * Structure of a directory entry */ #define EXT4_NAME_LEN 255 +/* + * Base length of the ext4 directory entry excluding the name length + */ +#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN) struct ext4_dir_entry { __le32 inode; /* Inode number */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 8cb5ea7ee506..19c620118e62 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1466,10 +1466,10 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; - while ((char *) de < dlimit) { + while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ - if ((char *) de + de->name_len <= dlimit && + if (de->name + de->name_len <= dlimit && ext4_match(dir, fname, de)) { /* found a match - just to be sure, do * a full check */ -- cgit v1.2.3 From 9b90003771e5112e73d362ba4f4df03c7064ddc9 Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Thu, 31 Mar 2022 13:05:15 -0700 Subject: ext4: limit length to bitmap_maxbytes - blocksize in punch_hole commit 2da376228a2427501feb9d15815a45dbdbdd753e upstream. Syzbot found an issue [1] in ext4_fallocate(). The C reproducer [2] calls fallocate(), passing size 0xffeffeff000ul, and offset 0x1000000ul, which, when added together exceed the bitmap_maxbytes for the inode. This triggers a BUG in ext4_ind_remove_space(). According to the comments in this function the 'end' parameter needs to be one block after the last block to be removed. In the case when the BUG is triggered it points to the last block. Modify the ext4_punch_hole() function and add constraint that caps the length to satisfy the one before laster block requirement. LINK: [1] https://syzkaller.appspot.com/bug?id=b80bd9cf348aac724a4f4dff251800106d721331 LINK: [2] https://syzkaller.appspot.com/text?tag=ReproC&x=14ba0238700000 Fixes: a4bb6b64e39a ("ext4: enable "punch hole" functionality") Reported-by: syzbot+7a806094edd5d07ba029@syzkaller.appspotmail.com Signed-off-by: Tadeusz Struk Link: https://lore.kernel.org/r/20220331200515.153214-1-tadeusz.struk@linaro.org Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fbb590d95095..db73b49bd979 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3945,7 +3945,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; - loff_t first_block_offset, last_block_offset; + loff_t first_block_offset, last_block_offset, max_length; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; unsigned int credits; int ret = 0, ret2 = 0; @@ -3988,6 +3989,14 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) offset; } + /* + * For punch hole the length + offset needs to be within one block + * before last range. Adjust the length if it goes beyond that limit. + */ + max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; + if (offset + length > max_length) + length = max_length - offset; + if (offset & (sb->s_blocksize - 1) || (offset + length) & (sb->s_blocksize - 1)) { /* -- cgit v1.2.3 From 52ca84a3edd1914e575450bcd1ce6cbc6e15e2cb Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 Apr 2022 21:31:27 -0400 Subject: ext4: fix overhead calculation to account for the reserved gdt blocks commit 10b01ee92df52c8d7200afead4d5e5f55a5c58b1 upstream. The kernel calculation was underestimating the overhead by not taking into account the reserved gdt blocks. With this change, the overhead calculated by the kernel matches the overhead calculation in mke2fs. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fd4d34deb9fc..0d2242d41ce2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3697,9 +3697,11 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_fsblk_t first_block, last_block, b; ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; + int has_super = ext4_bg_has_super(sb, grp); if (!ext4_has_feature_bigalloc(sb)) - return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + + return (has_super + ext4_bg_num_gdb(sb, grp) + + (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + sbi->s_itb_per_group + 2); first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + -- cgit v1.2.3 From 2b273d1fd18ebebdc5e99139f0c89b142d40ea9c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 Apr 2022 21:57:49 -0400 Subject: ext4: force overhead calculation if the s_overhead_cluster makes no sense commit 85d825dbf4899a69407338bae462a59aa9a37326 upstream. If the file system does not use bigalloc, calculating the overhead is cheap, so force the recalculation of the overhead so we don't have to trust the precalculated overhead in the superblock. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0d2242d41ce2..fa21d8180319 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4788,9 +4788,18 @@ no_journal: * Get the # of file system overhead blocks from the * superblock if present. */ - if (es->s_overhead_clusters) - sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); - else { + sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); + /* ignore the precalculated value if it is ridiculous */ + if (sbi->s_overhead > ext4_blocks_count(es)) + sbi->s_overhead = 0; + /* + * If the bigalloc feature is not enabled recalculating the + * overhead doesn't take long, so we might as well just redo + * it to make sure we are using the correct value. + */ + if (!ext4_has_feature_bigalloc(sb)) + sbi->s_overhead = 0; + if (sbi->s_overhead == 0) { err = ext4_calculate_overhead(sb); if (err) goto failed_mount_wq; -- cgit v1.2.3 From b1b8f39c2475a1df597e6d1970e25bd64c89d774 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 17 Mar 2022 22:21:37 +0800 Subject: jbd2: fix a potential race while discarding reserved buffers after an abort commit 23e3d7f7061f8682c751c46512718f47580ad8f0 upstream. we got issue as follows: [ 72.796117] EXT4-fs error (device sda): ext4_journal_check_start:83: comm fallocate: Detected aborted journal [ 72.826847] EXT4-fs (sda): Remounting filesystem read-only fallocate: fallocate failed: Read-only file system [ 74.791830] jbd2_journal_commit_transaction: jh=0xffff9cfefe725d90 bh=0x0000000000000000 end delay [ 74.793597] ------------[ cut here ]------------ [ 74.794203] kernel BUG at fs/jbd2/transaction.c:2063! [ 74.794886] invalid opcode: 0000 [#1] PREEMPT SMP PTI [ 74.795533] CPU: 4 PID: 2260 Comm: jbd2/sda-8 Not tainted 5.17.0-rc8-next-20220315-dirty #150 [ 74.798327] RIP: 0010:__jbd2_journal_unfile_buffer+0x3e/0x60 [ 74.801971] RSP: 0018:ffffa828c24a3cb8 EFLAGS: 00010202 [ 74.802694] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [ 74.803601] RDX: 0000000000000001 RSI: ffff9cfefe725d90 RDI: ffff9cfefe725d90 [ 74.804554] RBP: ffff9cfefe725d90 R08: 0000000000000000 R09: ffffa828c24a3b20 [ 74.805471] R10: 0000000000000001 R11: 0000000000000001 R12: ffff9cfefe725d90 [ 74.806385] R13: ffff9cfefe725d98 R14: 0000000000000000 R15: ffff9cfe833a4d00 [ 74.807301] FS: 0000000000000000(0000) GS:ffff9d01afb00000(0000) knlGS:0000000000000000 [ 74.808338] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 74.809084] CR2: 00007f2b81bf4000 CR3: 0000000100056000 CR4: 00000000000006e0 [ 74.810047] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 74.810981] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 74.811897] Call Trace: [ 74.812241] [ 74.812566] __jbd2_journal_refile_buffer+0x12f/0x180 [ 74.813246] jbd2_journal_refile_buffer+0x4c/0xa0 [ 74.813869] jbd2_journal_commit_transaction.cold+0xa1/0x148 [ 74.817550] kjournald2+0xf8/0x3e0 [ 74.819056] kthread+0x153/0x1c0 [ 74.819963] ret_from_fork+0x22/0x30 Above issue may happen as follows: write truncate kjournald2 generic_perform_write ext4_write_begin ext4_walk_page_buffers do_journal_get_write_access ->add BJ_Reserved list ext4_journalled_write_end ext4_walk_page_buffers write_end_fn ext4_handle_dirty_metadata ***************JBD2 ABORT************** jbd2_journal_dirty_metadata -> return -EROFS, jh in reserved_list jbd2_journal_commit_transaction while (commit_transaction->t_reserved_list) jh = commit_transaction->t_reserved_list; truncate_pagecache_range do_invalidatepage ext4_journalled_invalidatepage jbd2_journal_invalidatepage journal_unmap_buffer __dispose_buffer __jbd2_journal_unfile_buffer jbd2_journal_put_journal_head ->put last ref_count __journal_remove_journal_head bh->b_private = NULL; jh->b_bh = NULL; jbd2_journal_refile_buffer(journal, jh); bh = jh2bh(jh); ->bh is NULL, later will trigger null-ptr-deref journal_free_journal_head(jh); After commit 96f1e0974575, we no longer hold the j_state_lock while iterating over the list of reserved handles in jbd2_journal_commit_transaction(). This potentially allows the journal_head to be freed by journal_unmap_buffer while the commit codepath is also trying to free the BJ_Reserved buffers. Keeping j_state_lock held while trying extends hold time of the lock minimally, and solves this issue. Fixes: 96f1e0974575("jbd2: avoid long hold times of j_state_lock while committing a transaction") Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220317142137.1821590-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/jbd2/commit.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index d188fa913a07..34b1406c06fd 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -501,7 +501,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) } spin_unlock(&commit_transaction->t_handle_lock); commit_transaction->t_state = T_SWITCH; - write_unlock(&journal->j_state_lock); J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= journal->j_max_transaction_buffers); @@ -521,6 +520,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) * has reserved. This is consistent with the existing behaviour * that multiple jbd2_journal_get_write_access() calls to the same * buffer are perfectly permissible. + * We use journal->j_state_lock here to serialize processing of + * t_reserved_list with eviction of buffers from journal_unmap_buffer(). */ while (commit_transaction->t_reserved_list) { jh = commit_transaction->t_reserved_list; @@ -540,6 +541,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_refile_buffer(journal, jh); } + write_unlock(&journal->j_state_lock); /* * Now try to drop any written-back buffers from the journal's * checkpoint lists. We do this *before* commit because it potentially -- cgit v1.2.3