From 0dff1b407def32452a0d80148172889862c64fe7 Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Thu, 16 Nov 2023 15:15:45 -0500
Subject: mm/pagemap: fix ioctl(PAGEMAP_SCAN) on vma check

Patch series "mm/pagemap: A few fixes to the recent PAGEMAP_SCAN".

This series should fix two known reports from syzbot on the new PAGEMAP_SCAN ioctl():

https://lore.kernel.org/all/000000000000b0e576060a30ee3b@google.com/
https://lore.kernel.org/all/000000000000773fa7060a31e2cc@google.com/

The 3rd patch is something I found when testing these patches.

This patch (of 3):

The new ioctl(PAGEMAP_SCAN) relies on the vma wr-protect capability provided by userfaultfd; however, the vma test did not explicitly require the vma to have the wr-protect function enabled, even if the PM_SCAN_WP_MATCHING flag is set.

It means the pagemap code can now apply the uffd-wp bit to a page in a vma even if the vma is not registered with userfaultfd at all.  Then, as soon as the pte gets written to and the page fault is resolved, the write bit is applied even though the uffd-wp bit is set, and we end up with a pte that has both the UFFD_WP and WRITE bits set.  Anything that later looks up the uffd-wp bit of such a pte will trigger the warning:

  WARNING: CPU: 1 PID: 5071 at arch/x86/include/asm/pgtable.h:403 pte_uffd_wp arch/x86/include/asm/pgtable.h:403 [inline]

Fix it by doing a proper check of the vma attributes when PM_SCAN_WP_MATCHING is specified.

Link: https://lkml.kernel.org/r/20231116201547.536857-1-peterx@redhat.com
Link: https://lkml.kernel.org/r/20231116201547.536857-2-peterx@redhat.com
Fixes: 52526ca7fdb9 ("fs/proc/task_mmu: implement IOCTL to get and optionally clear info about PTEs")
Signed-off-by: Peter Xu
Reported-by: syzbot+e94c5aaf7890901ebf9b@syzkaller.appspotmail.com
Reviewed-by: David Hildenbrand
Reviewed-by: Andrei Vagin
Reviewed-by: Muhammad Usama Anjum
Signed-off-by: Andrew Morton
---
 fs/proc/task_mmu.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ef2eb12906da..a6f88cbfb689 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1982,15 +1982,31 @@ static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
 	struct pagemap_scan_private *p = walk->private;
 	struct vm_area_struct *vma = walk->vma;
 	unsigned long vma_category = 0;
+	bool wp_allowed = userfaultfd_wp_async(vma) &&
+		userfaultfd_wp_use_markers(vma);

-	if (userfaultfd_wp_async(vma) && userfaultfd_wp_use_markers(vma))
-		vma_category |= PAGE_IS_WPALLOWED;
-	else if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
-		return -EPERM;
+	if (!wp_allowed) {
+		/* User requested explicit failure over wp-async capability */
+		if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
+			return -EPERM;
+		/*
+		 * User requires wr-protect, and allows silently skipping
+		 * unsupported vmas.
+		 */
+		if (p->arg.flags & PM_SCAN_WP_MATCHING)
+			return 1;
+		/*
+		 * Then the request doesn't involve wr-protects at all,
+		 * fall through to the rest checks, and allow vma walk.
+		 */
+	}

 	if (vma->vm_flags & VM_PFNMAP)
 		return 1;

+	if (wp_allowed)
+		vma_category |= PAGE_IS_WPALLOWED;
+
 	if (!pagemap_scan_is_interesting_vma(vma_category, p))
 		return 1;
-- cgit v1.2.3


From 4980e837cab7e2acc4dad18dba656c6896c531aa Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Thu, 16 Nov 2023 15:15:46 -0500
Subject: mm/pagemap: fix wr-protect even if PM_SCAN_WP_MATCHING not set

The new pagemap ioctl contains a fast path that performs wr-protection without looking into the category masks.  It forgets to check PM_SCAN_WP_MATCHING before applying the wr-protection.
It can cause, e.g., pte markers to be installed on archs that do not even support uffd wr-protect:

  WARNING: CPU: 0 PID: 5059 at mm/memory.c:1520 zap_pte_range mm/memory.c:1520 [inline]

Link: https://lkml.kernel.org/r/20231116201547.536857-3-peterx@redhat.com
Fixes: 12f6b01a0bcb ("fs/proc/task_mmu: add fast paths to get/clear PAGE_IS_WRITTEN flag")
Signed-off-by: Peter Xu
Reported-by: syzbot+7ca4b2719dc742b8d0a4@syzkaller.appspotmail.com
Reviewed-by: David Hildenbrand
Reviewed-by: Andrei Vagin
Cc: Muhammad Usama Anjum
Signed-off-by: Andrew Morton
---
 fs/proc/task_mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a6f88cbfb689..435b61054b5b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2156,7 +2156,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
 		return 0;
 	}

-	if (!p->vec_out) {
+	if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
 		/* Fast path for performing exclusive WP */
 		for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
 			if (pte_uffd_wp(ptep_get(pte)))
-- cgit v1.2.3


From eb66b8abae98f869c224f7c852b685ae02144564 Mon Sep 17 00:00:00 2001
From: Lizhi Xu
Date: Thu, 16 Nov 2023 11:13:52 +0800
Subject: squashfs: squashfs_read_data need to check if the length is 0

When the length passed in is 0, squashfs_read_data() should bail out with an error instead of continuing with the read; letting a zero length through causes at least a WARN_ON().

Link: https://lkml.kernel.org/r/20231116031352.40853-1-lizhi.xu@windriver.com
Reported-by: syzbot+32d3767580a1ea339a81@syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/0000000000000526f2060a30a085@google.com
Signed-off-by: Lizhi Xu
Reviewed-by: Phillip Lougher
Signed-off-by: Andrew Morton
---
 fs/squashfs/block.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 581ce9519339..2dc730800f44 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -321,7 +321,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
 		TRACE("Block @ 0x%llx, %scompressed size %d\n", index - 2,
 		      compressed ? "" : "un", length);
 	}
-	if (length < 0 || length > output->length ||
+	if (length <= 0 || length > output->length ||
 			(index + length) > msblk->bytes_used) {
 		res = -EIO;
 		goto out;
-- cgit v1.2.3


From d61d0ab573649789bf9eb909c89a1a193b2e3d10 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi
Date: Wed, 29 Nov 2023 23:15:47 +0900
Subject: nilfs2: fix missing error check for sb_set_blocksize call

When mounting a filesystem image with a block size larger than the page size, nilfs2 repeatedly outputs long error messages with stack traces to the kernel log, such as the following:

  getblk(): invalid block size 8192 requested
  logical block size: 512
  ...
  Call Trace:
   dump_stack_lvl+0x92/0xd4
   dump_stack+0xd/0x10
   bdev_getblk+0x33a/0x354
   __breadahead+0x11/0x80
   nilfs_search_super_root+0xe2/0x704 [nilfs2]
   load_nilfs+0x72/0x504 [nilfs2]
   nilfs_mount+0x30f/0x518 [nilfs2]
   legacy_get_tree+0x1b/0x40
   vfs_get_tree+0x18/0xc4
   path_mount+0x786/0xa88
   __ia32_sys_mount+0x147/0x1a8
   __do_fast_syscall_32+0x56/0xc8
   do_fast_syscall_32+0x29/0x58
   do_SYSENTER_32+0x15/0x18
   entry_SYSENTER_32+0x98/0xf1
   ...

This overloads the system logger.  To make matters worse, it sometimes crashes the kernel with a memory access violation.  This is because the return value of the sb_set_blocksize() call, which should be checked for errors, is not checked.

The latter issue is caused by out-of-buffer memory accesses: when sb_set_blocksize() fails because of the overly large block size, the super_block structure keeps the initial minimum block size, so buffers that were read with that stale block size are then accessed based on the larger block size, past the end of the buffer.

Since the nilfs2 mkfs tool does not accept block sizes larger than the system page size, this has been overlooked.  However, it is possible to create this situation by intentionally modifying the tool, or by passing a filesystem image created on a system with a large page size to a system with a smaller page size and mounting it.

Fix this issue by inserting the expected error handling for the call to sb_set_blocksize().

Link: https://lkml.kernel.org/r/20231129141547.4726-1-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi
Tested-by: Ryusuke Konishi
Cc:
Signed-off-by: Andrew Morton
---
 fs/nilfs2/the_nilfs.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 0f0667957c81..71400496ed36 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -716,7 +716,11 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
 		goto failed_sbh;
 	}
 	nilfs_release_super_block(nilfs);
-	sb_set_blocksize(sb, blocksize);
+	if (!sb_set_blocksize(sb, blocksize)) {
+		nilfs_err(sb, "bad blocksize %d", blocksize);
+		err = -EINVAL;
+		goto out;
+	}

 	err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
 	if (err)
-- cgit v1.2.3


From 4a3ef6be03e6700037fc20e63aa5ffd972e435ca Mon Sep 17 00:00:00 2001
From: Sidhartha Kumar
Date: Mon, 4 Dec 2023 10:32:34 -0800
Subject: mm/hugetlb: have CONFIG_HUGETLB_PAGE select CONFIG_XARRAY_MULTI

After commit a08c7193e4f1 ("mm/filemap: remove hugetlb special casing in filemap.c"), hugetlb pages are stored in the page cache at base-page-sized indexes.  This leads to multi-index stores in the xarray, which are only supported with CONFIG_XARRAY_MULTI.  The other page cache user of multi-index stores, THP, selects XARRAY_MULTI.  Have CONFIG_HUGETLB_PAGE follow this behavior as well, to avoid the BUG() with a CONFIG_HUGETLB_PAGE && !CONFIG_XARRAY_MULTI config.

Link: https://lkml.kernel.org/r/20231204183234.348697-1-sidhartha.kumar@oracle.com
Fixes: a08c7193e4f1 ("mm/filemap: remove hugetlb special casing in filemap.c")
Signed-off-by: Sidhartha Kumar
Reported-by: Al Viro
Cc: Mike Kravetz
Cc: Muchun Song
Signed-off-by: Andrew Morton
---
 fs/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index fd1f655b4f1f..42837617a55b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -268,6 +268,7 @@ config HUGETLBFS

 config HUGETLB_PAGE
 	def_bool HUGETLBFS
+	select XARRAY_MULTI

 config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	def_bool HUGETLB_PAGE
-- cgit v1.2.3


From 675abf8df1353e0e3bde314993e0796c524cfbf0 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi
Date: Tue, 5 Dec 2023 17:59:47 +0900
Subject: nilfs2: prevent WARNING in nilfs_sufile_set_segment_usage()

If nilfs2 reads a disk image with corrupted segment usage metadata, and its segment usage information is marked as an error for the segment at the write location, nilfs_sufile_set_segment_usage() can trigger WARN_ONs during log writing.

Segments newly allocated for writing with nilfs_sufile_alloc() will not have this error flag set, but this unexpected situation will occur if the segment indexed by either nilfs->ns_segnum or nilfs->ns_nextnum (the active segment) was marked in error.

Fix this issue by inserting a sanity check that treats it as a file system corruption.

Since error returns are not allowed during the execution phase where nilfs_sufile_set_segment_usage() is used, this inserts the sanity check into nilfs_sufile_mark_dirty(), which pre-reads the buffer containing the segment usage record to be updated and sets it up in a dirty state for writing.

In addition, nilfs_sufile_set_segment_usage() is also called when canceling log writing and undoing the segment usage update, so to avoid issuing the same kernel warning in that case, skip the error-flag check in nilfs_sufile_set_segment_usage() when called for cancellation.

Link: https://lkml.kernel.org/r/20231205085947.4431-1-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi
Reported-by: syzbot+14e9f834f6ddecece094@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=14e9f834f6ddecece094
Tested-by: Ryusuke Konishi
Cc:
Signed-off-by: Andrew Morton
---
 fs/nilfs2/sufile.c | 42 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 2c6078a6b8ec..58ca7c936393 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -501,15 +501,38 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)

 	down_write(&NILFS_MDT(sufile)->mi_sem);
 	ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
-	if (!ret) {
-		mark_buffer_dirty(bh);
-		nilfs_mdt_mark_dirty(sufile);
-		kaddr = kmap_atomic(bh->b_page);
-		su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
+	if (ret)
+		goto out_sem;
+
+	kaddr = kmap_atomic(bh->b_page);
+	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
+	if (unlikely(nilfs_segment_usage_error(su))) {
+		struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+
+		kunmap_atomic(kaddr);
+		brelse(bh);
+		if (nilfs_segment_is_active(nilfs, segnum)) {
+			nilfs_error(sufile->i_sb,
+				    "active segment %llu is erroneous",
+				    (unsigned long long)segnum);
+		} else {
+			/*
+			 * Segments marked erroneous are never allocated by
+			 * nilfs_sufile_alloc(); only active segments, ie,
+			 * the segments indexed by ns_segnum or ns_nextnum,
+			 * can be erroneous here.
+			 */
+			WARN_ON_ONCE(1);
+		}
+		ret = -EIO;
+	} else {
 		nilfs_segment_usage_set_dirty(su);
 		kunmap_atomic(kaddr);
+		mark_buffer_dirty(bh);
+		nilfs_mdt_mark_dirty(sufile);
 		brelse(bh);
 	}
+out_sem:
 	up_write(&NILFS_MDT(sufile)->mi_sem);
 	return ret;
 }
@@ -536,9 +559,14 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,

 	kaddr = kmap_atomic(bh->b_page);
 	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
-	WARN_ON(nilfs_segment_usage_error(su));
-	if (modtime)
+	if (modtime) {
+		/*
+		 * Check segusage error and set su_lastmod only when updating
+		 * this entry with a valid timestamp, not for cancellation.
+		 */
+		WARN_ON_ONCE(nilfs_segment_usage_error(su));
 		su->su_lastmod = cpu_to_le64(modtime);
+	}
 	su->su_nblocks = cpu_to_le32(nblocks);
 	kunmap_atomic(kaddr);
-- cgit v1.2.3
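
For context on the first two fixes above, here is a minimal userspace sketch of the PAGEMAP_SCAN flow they affect: register a range for async uffd-wp, wr-protect it with PM_SCAN_WP_MATCHING (plus PM_SCAN_CHECK_WPASYNC, so unsupported vmas fail with -EPERM instead of being wr-protected anyway), then scan again for PAGE_IS_WRITTEN.  This sketch is illustrative only and not part of the series; it assumes v6.7-era uapi headers that provide UFFD_FEATURE_WP_ASYNC, UFFD_FEATURE_WP_UNPOPULATED and the PAGEMAP_SCAN definitions in <linux/fs.h>, and error handling is kept minimal.

/*
 * Illustrative sketch: track written pages with ioctl(PAGEMAP_SCAN)
 * on top of async uffd-wp.  Not part of the patch series above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/fs.h>           /* PAGEMAP_SCAN, struct pm_scan_arg */
#include <linux/userfaultfd.h>  /* UFFDIO_*, UFFD_FEATURE_WP_ASYNC */

int main(void)
{
	size_t len = 16 * 4096;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	/* Enable async wr-protect tracking on the range via userfaultfd. */
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_WP_ASYNC | UFFD_FEATURE_WP_UNPOPULATED,
	};
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)buf, .len = len },
		.mode = UFFDIO_REGISTER_MODE_WP,
	};
	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) ||
	    ioctl(uffd, UFFDIO_REGISTER, &reg)) {
		perror("userfaultfd setup");
		return 1;
	}

	int pagemap = open("/proc/self/pagemap", O_RDONLY);
	struct page_region regions[16];
	struct pm_scan_arg arg = {
		.size = sizeof(arg),
		/*
		 * PM_SCAN_WP_MATCHING: wr-protect every matching page.
		 * PM_SCAN_CHECK_WPASYNC: fail with -EPERM if a vma in the
		 * range is not set up for async uffd-wp, rather than
		 * silently skipping it.
		 */
		.flags = PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
		.start = (unsigned long)buf,
		.end = (unsigned long)buf + len,
		.vec = (unsigned long)regions,
		.vec_len = 16,
		.category_mask = PAGE_IS_WRITTEN,
		.return_mask = PAGE_IS_WRITTEN,
	};

	/* Pass 1: wr-protect (i.e. "reset") all currently written pages. */
	if (ioctl(pagemap, PAGEMAP_SCAN, &arg) < 0) {
		perror("PAGEMAP_SCAN (wp pass)");
		return 1;
	}

	buf[0] = 1;	/* dirty one page; async uffd-wp resolves the fault */

	/* Pass 2: report which pages were written since the wp pass. */
	arg.flags = 0;
	long n = ioctl(pagemap, PAGEMAP_SCAN, &arg);
	for (long i = 0; i < n; i++)
		printf("written: 0x%llx-0x%llx\n",
		       (unsigned long long)regions[i].start,
		       (unsigned long long)regions[i].end);
	return 0;
}

With the first fix applied, the userfaultfd registration above is mandatory for the wp pass: an unregistered vma now returns -EPERM (or is skipped) instead of getting stray uffd-wp bits.  With the second fix, a scan that omits PM_SCAN_WP_MATCHING no longer wr-protects anything as a side effect.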