From 5956592ce337330cdff0399a6f8b6a5aea397a8e Mon Sep 17 00:00:00 2001 From: Qian Yingjin Date: Wed, 8 Feb 2023 10:24:00 +0800 Subject: mm/filemap: fix page end in filemap_get_read_batch I was running traces of the read code against an RAID storage system to understand why read requests were being misaligned against the underlying RAID strips. I found that the page end offset calculation in filemap_get_read_batch() was off by one. When a read is submitted with end offset 1048575, then it calculates the end page for read of 256 when it should be 255. "last_index" is the index of the page beyond the end of the read and it should be skipped when get a batch of pages for read in @filemap_get_read_batch(). The below simple patch fixes the problem. This code was introduced in kernel 5.12. Link: https://lkml.kernel.org/r/20230208022400.28962-1-coolqyj@163.com Fixes: cbd59c48ae2b ("mm/filemap: use head pages in generic_file_buffered_read") Signed-off-by: Qian Yingjin Reviewed-by: Matthew Wilcox (Oracle) Cc: Signed-off-by: Andrew Morton --- mm/filemap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index c4d4ace9cc70..0e20a8d6dd93 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2588,18 +2588,19 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, struct folio *folio; int err = 0; + /* "last_index" is the index of the page beyond the end of the read */ last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE); retry: if (fatal_signal_pending(current)) return -EINTR; - filemap_get_read_batch(mapping, index, last_index, fbatch); + filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); - filemap_get_read_batch(mapping, index, last_index, fbatch); + filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) -- cgit v1.2.3 From ae63c898f4004bbc7d212f4adcb3bb14852c30d6 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Tue, 24 Jan 2023 17:57:37 -0800 Subject: mm/MADV_COLLAPSE: set EAGAIN on unexpected page refcount During collapse, in a few places we check to see if a given small page has any unaccounted references. If the refcount on the page doesn't match our expectations, it must be there is an unknown user concurrently interested in the page, and so it's not safe to move the contents elsewhere. However, the unaccounted pins are likely an ephemeral state. In this situation, MADV_COLLAPSE returns -EINVAL when it should return -EAGAIN. This could cause userspace to conclude that the syscall failed, when it in fact could succeed by retrying. Link: https://lkml.kernel.org/r/20230125015738.912924-1-zokeefe@google.com Fixes: 7d8faaf15545 ("mm/madvise: introduce MADV_COLLAPSE sync hugepage collapse") Signed-off-by: Zach O'Keefe Reported-by: Hugh Dickins Acked-by: Hugh Dickins Reviewed-by: Yang Shi Cc: Signed-off-by: Andrew Morton --- mm/khugepaged.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 90acfea40c13..a26a28e3738c 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2611,6 +2611,7 @@ static int madvise_collapse_errno(enum scan_result r) case SCAN_CGROUP_CHARGE_FAIL: return -EBUSY; /* Resource temporary unavailable - trying again might succeed */ + case SCAN_PAGE_COUNT: case SCAN_PAGE_LOCK: case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: -- cgit v1.2.3 From 5c7388bcd59df59fe527246526bbdf8b9dedf8cc Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 14 Feb 2023 09:02:08 -0800 Subject: MAINTAINERS: update FPU EMULATOR web page The web page entry for the FPU EMULATOR no longer works. I notified Bill of this and he asked me to update it to this new entry. Link: https://lkml.kernel.org/r/20230214170208.17287-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Acked-by: Bill Metzenthen Signed-off-by: Andrew Morton --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 123216b76534..f4267d19749a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8196,7 +8196,7 @@ F: drivers/fpga/microchip-spi.c FPU EMULATOR M: Bill Metzenthen S: Maintained -W: http://floatingpoint.sourceforge.net/emulator/index.html +W: https://floatingpoint.billm.au/ F: arch/x86/math-emu/ FRAMEBUFFER CORE -- cgit v1.2.3 From 96a9c287e25d690fd9623b5133703b8e310fbed1 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 16 Feb 2023 10:30:59 -0500 Subject: mm/migrate: fix wrongly apply write bit after mkdirty on sparc64 Nick Bowler reported another sparc64 breakage after the young/dirty persistent work for page migration (per "Link:" below). That's after a similar report [2]. It turns out page migration was overlooked, and it wasn't failing before because page migration was not enabled in the initial report test environment. David proposed another way [2] to fix this from sparc64 side, but that patch didn't land somehow. Neither did I check whether there's any other arch that has similar issues. Let's fix it for now as simple as moving the write bit handling to be after dirty, like what we did before. Note: this is based on mm-unstable, because the breakage was since 6.1 and we're at a very late stage of 6.2 (-rc8), so I assume for this specific case we should target this at 6.3. [1] https://lore.kernel.org/all/20221021160603.GA23307@u164.east.ru/ [2] https://lore.kernel.org/all/20221212130213.136267-1-david@redhat.com/ Link: https://lkml.kernel.org/r/20230216153059.256739-1-peterx@redhat.com Fixes: 2e3468778dbe ("mm: remember young/dirty bit for page migrations") Link: https://lore.kernel.org/all/CADyTPExpEqaJiMGoV+Z6xVgL50ZoMJg49B10LcZ=8eg19u34BA@mail.gmail.com/ Signed-off-by: Peter Xu Reported-by: Nick Bowler Acked-by: David Hildenbrand Tested-by: Nick Bowler Cc: Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 6 ++++-- mm/migrate.c | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index abe6cfd92ffa..1b791b26d72d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3272,8 +3272,6 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); - if (is_writable_migration_entry(entry)) - pmde = maybe_pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); if (!is_migration_entry_young(entry)) @@ -3281,6 +3279,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) /* NOTE: this may contain setting soft-dirty on some archs */ if (PageDirty(new) && is_migration_entry_dirty(entry)) pmde = pmd_mkdirty(pmde); + if (is_writable_migration_entry(entry)) + pmde = maybe_pmd_mkwrite(pmde, vma); + else + pmde = pmd_wrprotect(pmde); if (PageAnon(new)) { rmap_t rmap_flags = RMAP_COMPOUND; diff --git a/mm/migrate.c b/mm/migrate.c index a4d3fc65085f..cc5455614e01 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -224,6 +224,8 @@ static bool remove_migration_pte(struct folio *folio, pte = maybe_mkwrite(pte, vma); else if (pte_swp_uffd_wp(*pvmw.pte)) pte = pte_mkuffd_wp(pte); + else + pte = pte_wrprotect(pte); if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) rmap_flags |= RMAP_EXCLUSIVE; -- cgit v1.2.3 From ec4288fe63966b26d53907212ecd05dfa81dd2cc Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 15 Feb 2023 17:35:42 -0800 Subject: hugetlb: check for undefined shift on 32 bit architectures Users can specify the hugetlb page size in the mmap, shmget and memfd_create system calls. This is done by using 6 bits within the flags argument to encode the base-2 logarithm of the desired page size. The routine hstate_sizelog() uses the log2 value to find the corresponding hugetlb hstate structure. Converting the log2 value (page_size_log) to potential hugetlb page size is the simple statement: 1UL << page_size_log Because only 6 bits are used for page_size_log, the left shift can not be greater than 63. This is fine on 64 bit architectures where a long is 64 bits. However, if a value greater than 31 is passed on a 32 bit architecture (where long is 32 bits) the shift will result in undefined behavior. This was generally not an issue as the result of the undefined shift had to exactly match hugetlb page size to proceed. Recent improvements in runtime checking have resulted in this undefined behavior throwing errors such as reported below. Fix by comparing page_size_log to BITS_PER_LONG before doing shift. Link: https://lkml.kernel.org/r/20230216013542.138708-1-mike.kravetz@oracle.com Link: https://lore.kernel.org/lkml/CA+G9fYuei_Tr-vN9GS7SfFyU1y9hNysnf=PB7kT0=yv4MiPgVg@mail.gmail.com/ Fixes: 42d7395feb56 ("mm: support more pagesizes for MAP_HUGETLB/SHM_HUGETLB") Signed-off-by: Mike Kravetz Reported-by: Naresh Kamboju Reviewed-by: Jesper Juhl Acked-by: Muchun Song Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Cc: Anders Roxell Cc: Andi Kleen Cc: Sasha Levin Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index db194e2ba69f..9ab9d3105d5c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -743,7 +743,10 @@ static inline struct hstate *hstate_sizelog(int page_size_log) if (!page_size_log) return &default_hstate; - return size_to_hstate(1UL << page_size_log); + if (page_size_log < BITS_PER_LONG) + return size_to_hstate(1UL << page_size_log); + + return NULL; } static inline struct hstate *hstate_vma(struct vm_area_struct *vma) -- cgit v1.2.3 From 99b9402a36f0799f25feee4465bfa4b8dfa74b4d Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 15 Feb 2023 07:40:43 +0900 Subject: nilfs2: fix underflow in second superblock position calculations Macro NILFS_SB2_OFFSET_BYTES, which computes the position of the second superblock, underflows when the argument device size is less than 4096 bytes. Therefore, when using this macro, it is necessary to check in advance that the device size is not less than a lower limit, or at least that underflow does not occur. The current nilfs2 implementation lacks this check, causing out-of-bound block access when mounting devices smaller than 4096 bytes: I/O error, dev loop0, sector 36028797018963960 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 2 NILFS (loop0): unable to read secondary superblock (blocksize = 1024) In addition, when trying to resize the filesystem to a size below 4096 bytes, this underflow occurs in nilfs_resize_fs(), passing a huge number of segments to nilfs_sufile_resize(), corrupting parameters such as the number of segments in superblocks. This causes excessive loop iterations in nilfs_sufile_resize() during a subsequent resize ioctl, causing semaphore ns_segctor_sem to block for a long time and hang the writer thread: INFO: task segctord:5067 blocked for more than 143 seconds. Not tainted 6.2.0-rc8-syzkaller-00015-gf6feea56f66d #0 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:segctord state:D stack:23456 pid:5067 ppid:2 flags:0x00004000 Call Trace: context_switch kernel/sched/core.c:5293 [inline] __schedule+0x1409/0x43f0 kernel/sched/core.c:6606 schedule+0xc3/0x190 kernel/sched/core.c:6682 rwsem_down_write_slowpath+0xfcf/0x14a0 kernel/locking/rwsem.c:1190 nilfs_transaction_lock+0x25c/0x4f0 fs/nilfs2/segment.c:357 nilfs_segctor_thread_construct fs/nilfs2/segment.c:2486 [inline] nilfs_segctor_thread+0x52f/0x1140 fs/nilfs2/segment.c:2570 kthread+0x270/0x300 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 ... Call Trace: folio_mark_accessed+0x51c/0xf00 mm/swap.c:515 __nilfs_get_page_block fs/nilfs2/page.c:42 [inline] nilfs_grab_buffer+0x3d3/0x540 fs/nilfs2/page.c:61 nilfs_mdt_submit_block+0xd7/0x8f0 fs/nilfs2/mdt.c:121 nilfs_mdt_read_block+0xeb/0x430 fs/nilfs2/mdt.c:176 nilfs_mdt_get_block+0x12d/0xbb0 fs/nilfs2/mdt.c:251 nilfs_sufile_get_segment_usage_block fs/nilfs2/sufile.c:92 [inline] nilfs_sufile_truncate_range fs/nilfs2/sufile.c:679 [inline] nilfs_sufile_resize+0x7a3/0x12b0 fs/nilfs2/sufile.c:777 nilfs_resize_fs+0x20c/0xed0 fs/nilfs2/super.c:422 nilfs_ioctl_resize fs/nilfs2/ioctl.c:1033 [inline] nilfs_ioctl+0x137c/0x2440 fs/nilfs2/ioctl.c:1301 ... This fixes these issues by inserting appropriate minimum device size checks or anti-underflow checks, depending on where the macro is used. Link: https://lkml.kernel.org/r/0000000000004e1dfa05f4a48e6b@google.com Link: https://lkml.kernel.org/r/20230214224043.24141-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/ioctl.c | 7 +++++++ fs/nilfs2/super.c | 9 +++++++++ fs/nilfs2/the_nilfs.c | 8 +++++++- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 87e1004b606d..b4041d0566a9 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1114,7 +1114,14 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) minseg = range[0] + segbytes - 1; do_div(minseg, segbytes); + + if (range[1] < 4096) + goto out; + maxseg = NILFS_SB2_OFFSET_BYTES(range[1]); + if (maxseg < segbytes) + goto out; + do_div(maxseg, segbytes); maxseg--; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 6edb6e0dd61f..1422b8ba24ed 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -408,6 +408,15 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize) if (newsize > devsize) goto out; + /* + * Prevent underflow in second superblock position calculation. + * The exact minimum size check is done in nilfs_sufile_resize(). + */ + if (newsize < 4096) { + ret = -ENOSPC; + goto out; + } + /* * Write lock is required to protect some functions depending * on the number of segments, the number of reserved segments, diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 2064e6473d30..3a4c9c150cbf 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -544,9 +544,15 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, { struct nilfs_super_block **sbp = nilfs->ns_sbp; struct buffer_head **sbh = nilfs->ns_sbh; - u64 sb2off = NILFS_SB2_OFFSET_BYTES(bdev_nr_bytes(nilfs->ns_bdev)); + u64 sb2off, devsize = bdev_nr_bytes(nilfs->ns_bdev); int valid[2], swp = 0; + if (devsize < NILFS_SEG_MIN_BLOCKS * NILFS_MIN_BLOCK_SIZE + 4096) { + nilfs_err(sb, "device size too small"); + return -EINVAL; + } + sb2off = NILFS_SB2_OFFSET_BYTES(devsize); + sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, &sbh[0]); sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]); -- cgit v1.2.3