From a7605426666196c5a460dd3de6f8dac1d3c21f00 Mon Sep 17 00:00:00 2001
From: Yang Shi
Date: Fri, 14 Jan 2022 14:05:19 -0800
Subject: mm: shmem: don't truncate page if memory failure happens

The current behavior of memory failure is to truncate the page cache
regardless of whether the page is dirty or clean.  If the page is
dirty, a later access will read obsolete data from disk without any
notification to the user, which may cause silent data loss.  It is
even worse for shmem: since shmem is an in-memory filesystem,
truncating the page cache means discarding the data blocks, so a later
read would return all zeroes.

The right approach is to keep the corrupted page in the page cache;
any later access then returns an error for syscalls or SIGBUS for page
faults, until the file is truncated, hole punched or removed.  Regular
storage-backed filesystems would be more complicated, so this patch
focuses on shmem.  This also unblocks support for soft offlining shmem
THPs.
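As an illustration of the new semantics (not part of this patch), a
userspace sketch along the following lines should now see read() fail
with EIO instead of silently returning zeroes.  It assumes
CONFIG_MEMORY_FAILURE, the privileges MADV_HWPOISON requires, and a
glibc providing memfd_create(); the exact observable error depends on
the access path (a page-fault access gets SIGBUS instead):

	/* Hypothetical demonstration; build with -D_GNU_SOURCE. */
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = memfd_create("poison-test", 0);
		char buf[4096];

		ftruncate(fd, 4096);
		write(fd, "data", 4);

		char *map = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
				 MAP_SHARED, fd, 0);
		/* Inject a memory failure on the shmem page. */
		if (madvise(map, 4096, MADV_HWPOISON))
			perror("madvise");

		/* Before this patch: 4096 bytes of zeroes (silent data
		 * loss).  After this patch: read() fails with EIO. */
		lseek(fd, 0, SEEK_SET);
		if (read(fd, buf, sizeof(buf)) < 0)
			perror("read");
		return 0;
	}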
[akpm@linux-foundation.org: coding style fixes]
[arnd@arndb.de: fix uninitialized variable use in me_pagecache_clean()]
Link: https://lkml.kernel.org/r/20211022064748.4173718-1-arnd@kernel.org
[Fix invalid pointer dereference in shmem_read_mapping_page_gfp() with
 a slightly different implementation from what Ajay Garg and Muchun
 Song proposed, and rework the error handling of shmem_write_begin()
 as suggested by Linus]
Link: https://lore.kernel.org/linux-mm/20211111084617.6746-1-ajaygargnsit@gmail.com/
Link: https://lkml.kernel.org/r/20211020210755.23964-6-shy828301@gmail.com
Link: https://lkml.kernel.org/r/20211116193247.21102-1-shy828301@gmail.com
Signed-off-by: Yang Shi
Signed-off-by: Arnd Bergmann
Cc: Hugh Dickins
Cc: Kirill A. Shutemov
Cc: Matthew Wilcox
Cc: Naoya Horiguchi
Cc: Oscar Salvador
Cc: Peter Xu
Cc: Ajay Garg
Cc: Muchun Song
Cc: Andy Lavr
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/shmem.c | 51 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 6 deletions(-)

(limited to 'mm/shmem.c')

diff --git a/mm/shmem.c b/mm/shmem.c
index 18f93c2d68f1..fb5152369926 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2457,6 +2457,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	pgoff_t index = pos >> PAGE_SHIFT;
+	int ret = 0;
 
 	/* i_rwsem is held by caller */
 	if (unlikely(info->seals & (F_SEAL_GROW |
@@ -2467,7 +2468,19 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
 			return -EPERM;
 	}
 
-	return shmem_getpage(inode, index, pagep, SGP_WRITE);
+	ret = shmem_getpage(inode, index, pagep, SGP_WRITE);
+
+	if (ret)
+		return ret;
+
+	if (PageHWPoison(*pagep)) {
+		unlock_page(*pagep);
+		put_page(*pagep);
+		*pagep = NULL;
+		return -EIO;
+	}
+
+	return 0;
 }
 
 static int
@@ -2554,6 +2567,12 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			if (sgp == SGP_CACHE)
 				set_page_dirty(page);
 			unlock_page(page);
+
+			if (PageHWPoison(page)) {
+				put_page(page);
+				error = -EIO;
+				break;
+			}
 		}
 
 		/*
@@ -3093,7 +3112,8 @@ static const char *shmem_get_link(struct dentry *dentry,
 		page = find_get_page(inode->i_mapping, 0);
 		if (!page)
 			return ERR_PTR(-ECHILD);
-		if (!PageUptodate(page)) {
+		if (PageHWPoison(page) ||
+		    !PageUptodate(page)) {
 			put_page(page);
 			return ERR_PTR(-ECHILD);
 		}
@@ -3101,6 +3121,13 @@ static const char *shmem_get_link(struct dentry *dentry,
 		error = shmem_getpage(inode, 0, &page, SGP_READ);
 		if (error)
 			return ERR_PTR(error);
+		if (!page)
+			return ERR_PTR(-ECHILD);
+		if (PageHWPoison(page)) {
+			unlock_page(page);
+			put_page(page);
+			return ERR_PTR(-ECHILD);
+		}
 		unlock_page(page);
 	}
 	set_delayed_call(done, shmem_put_link, page);
@@ -3751,6 +3778,13 @@ static void shmem_destroy_inodecache(void)
 	kmem_cache_destroy(shmem_inode_cachep);
 }
 
+/* Keep the page in page cache instead of truncating it */
+static int shmem_error_remove_page(struct address_space *mapping,
+				   struct page *page)
+{
+	return 0;
+}
+
 const struct address_space_operations shmem_aops = {
 	.writepage	= shmem_writepage,
 	.set_page_dirty	= __set_page_dirty_no_writeback,
@@ -3761,7 +3795,7 @@ const struct address_space_operations shmem_aops = {
 #ifdef CONFIG_MIGRATION
 	.migratepage	= migrate_page,
 #endif
-	.error_remove_page = generic_error_remove_page,
+	.error_remove_page = shmem_error_remove_page,
 };
 EXPORT_SYMBOL(shmem_aops);
 
@@ -4169,9 +4203,14 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
 	error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
 				  gfp, NULL, NULL, NULL);
 	if (error)
-		page = ERR_PTR(error);
-	else
-		unlock_page(page);
+		return ERR_PTR(error);
+
+	unlock_page(page);
+	if (PageHWPoison(page)) {
+		put_page(page);
+		return ERR_PTR(-EIO);
+	}
+
 	return page;
 #else
 	/*
--
cgit v1.2.3


From 62c9827cbb996c2c04f615ecd783ce28bcea894b Mon Sep 17 00:00:00 2001
From: Gang Li
Date: Fri, 14 Jan 2022 14:05:23 -0800
Subject: shmem: fix a race between shmem_unused_huge_shrink and shmem_evict_inode

Fix a data race in commit 779750d20b93 ("shmem: split huge pages beyond
i_size under memory pressure").

Here are the call traces causing the race:

   Call Trace 1:
     shmem_unused_huge_shrink+0x3ae/0x410
     ? __list_lru_walk_one.isra.5+0x33/0x160
     super_cache_scan+0x17c/0x190
     shrink_slab.part.55+0x1ef/0x3f0
     shrink_node+0x10e/0x330
     kswapd+0x380/0x740
     kthread+0xfc/0x130
     ? mem_cgroup_shrink_node+0x170/0x170
     ? kthread_create_on_node+0x70/0x70
     ret_from_fork+0x1f/0x30

   Call Trace 2:
     shmem_evict_inode+0xd8/0x190
     evict+0xbe/0x1c0
     do_unlinkat+0x137/0x330
     do_syscall_64+0x76/0x120
     entry_SYSCALL_64_after_hwframe+0x3d/0xa2

A simple explanation: imagine there are 3 items on the local list
(@list).  In the first traversal, A is not deleted from @list:

  1)    A->B->C
        ^
        |
        pos (leave)

In the second traversal, B is deleted from @list.  Concurrently, A is
deleted from @list through shmem_evict_inode(), because the last
reference count of the inode is dropped by another thread.  @list is
then corrupted:

  2)    A->B->C
        ^  ^
        |  |
     evict pos (drop)

We should make sure the inode is either on the global list or deleted
from any local list before iput().  Fix this by moving the inodes back
to the global list before putting them.
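Distilled to its essence (a sketch of the invariant, not the exact
kernel code), the rule the fix enforces around iput() is:

	/*
	 * shmem_evict_inode() does list_del_init() on ->shrinklist
	 * without knowing about the on-stack local @list, so an inode
	 * must never be reachable only through @list when its last
	 * reference may be dropped.
	 */

	/* Unsafe: inode may still be linked on the local @list. */
	iput(inode);			/* eviction can corrupt @list */

	/* Safe: move it back under the lock, then drop the reference. */
	spin_lock(&sbinfo->shrinklist_lock);
	list_move(&info->shrinklist, &sbinfo->shrinklist);
	sbinfo->shrinklist_len++;
	spin_unlock(&sbinfo->shrinklist_lock);
	iput(inode);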
[akpm@linux-foundation.org: coding style fixes]
Link: https://lkml.kernel.org/r/20211125064502.99983-1-ligang.bdlg@bytedance.com
Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure")
Signed-off-by: Gang Li
Reviewed-by: Muchun Song
Acked-by: Kirill A. Shutemov
Cc: Hugh Dickins
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/shmem.c | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

(limited to 'mm/shmem.c')

diff --git a/mm/shmem.c b/mm/shmem.c
index fb5152369926..8f940552c182 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -554,7 +554,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 	struct shmem_inode_info *info;
 	struct page *page;
 	unsigned long batch = sc ? sc->nr_to_scan : 128;
-	int removed = 0, split = 0;
+	int split = 0;
 
 	if (list_empty(&sbinfo->shrinklist))
 		return SHRINK_STOP;
@@ -569,7 +569,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 		/* inode is about to be evicted */
 		if (!inode) {
 			list_del_init(&info->shrinklist);
-			removed++;
 			goto next;
 		}
 
@@ -577,12 +576,12 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 		if (round_up(inode->i_size, PAGE_SIZE) ==
 				round_up(inode->i_size, HPAGE_PMD_SIZE)) {
 			list_move(&info->shrinklist, &to_remove);
-			removed++;
 			goto next;
 		}
 
 		list_move(&info->shrinklist, &list);
 next:
+		sbinfo->shrinklist_len--;
 		if (!--batch)
 			break;
 	}
@@ -602,7 +601,7 @@ next:
 		inode = &info->vfs_inode;
 
 		if (nr_to_split && split >= nr_to_split)
-			goto leave;
+			goto move_back;
 
 		page = find_get_page(inode->i_mapping,
 				(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
@@ -616,38 +615,44 @@ next:
 		}
 
 		/*
-		 * Leave the inode on the list if we failed to lock
-		 * the page at this time.
+		 * Move the inode on the list back to shrinklist if we failed
+		 * to lock the page at this time.
 		 *
 		 * Waiting for the lock may lead to deadlock in the
 		 * reclaim path.
 		 */
 		if (!trylock_page(page)) {
 			put_page(page);
-			goto leave;
+			goto move_back;
 		}
 
 		ret = split_huge_page(page);
 		unlock_page(page);
 		put_page(page);
 
-		/* If split failed leave the inode on the list */
+		/* If split failed move the inode on the list back to shrinklist */
 		if (ret)
-			goto leave;
+			goto move_back;
 
 		split++;
 drop:
 		list_del_init(&info->shrinklist);
-		removed++;
-leave:
+		goto put;
+move_back:
+		/*
+		 * Make sure the inode is either on the global list or deleted
+		 * from any local list before iput() since it could be deleted
+		 * in another thread once we put the inode (then the local list
+		 * is corrupted).
+		 */
+		spin_lock(&sbinfo->shrinklist_lock);
+		list_move(&info->shrinklist, &sbinfo->shrinklist);
+		sbinfo->shrinklist_len++;
+		spin_unlock(&sbinfo->shrinklist_lock);
+put:
 		iput(inode);
 	}
 
-	spin_lock(&sbinfo->shrinklist_lock);
-	list_splice_tail(&list, &sbinfo->shrinklist);
-	sbinfo->shrinklist_len -= removed;
-	spin_unlock(&sbinfo->shrinklist_lock);
-
 	return split;
 }
--
cgit v1.2.3


From be1a13eb51077b2ec5f7f4306f93dfece503a3f1 Mon Sep 17 00:00:00 2001
From: Michal Hocko
Date: Fri, 14 Jan 2022 14:07:27 -0800
Subject: mm: drop node from alloc_pages_vma

alloc_pages_vma() is meant to allocate a page with a vma-specific
memory policy.  The initial node parameter is always the local node,
so it is pointless to waste a function argument on it.  Drop the
parameter.
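For callers, the conversion is mechanical (illustrative, mirroring the
shmem hunk below):

	/* Before: the node argument was always the local node. */
	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0,
			       numa_node_id(), true);

	/* After: alloc_pages_vma() derives the node itself via
	 * numa_node_id(), so callers simply drop the argument. */
	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);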
Link: https://lkml.kernel.org/r/YaSnlv4QpryEpesG@dhcp22.suse.cz
Signed-off-by: Michal Hocko
Cc: Aneesh Kumar K.V
Cc: Ben Widawsky
Cc: Dave Hansen
Cc: Feng Tang
Cc: Andrea Arcangeli
Cc: Mel Gorman
Cc: Mike Kravetz
Cc: Randy Dunlap
Cc: Vlastimil Babka
Cc: Andi Kleen
Cc: Dan Williams
Cc: "Huang, Ying"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/gfp.h | 8 ++++----
 mm/mempolicy.c      | 3 ++-
 mm/shmem.c          | 3 +--
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'mm/shmem.c')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 8fcc38467af6..78b58448f796 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -598,9 +598,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order);
 struct folio *folio_alloc(gfp_t gfp, unsigned order);
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
 			struct vm_area_struct *vma, unsigned long addr,
-			int node, bool hugepage);
+			bool hugepage);
 #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
-	alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
+	alloc_pages_vma(gfp_mask, order, vma, addr, true)
 #else
 static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
@@ -610,14 +610,14 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
 {
 	return __folio_alloc_node(gfp, order, numa_node_id());
 }
-#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
+#define alloc_pages_vma(gfp_mask, order, vma, addr, false)\
 	alloc_pages(gfp_mask, order)
 #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
 	alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 #define alloc_page_vma(gfp_mask, vma, addr)			\
-	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
+	alloc_pages_vma(gfp_mask, 0, vma, addr, false)
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 679f47b3a079..ed7d15acb6a2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2084,9 +2084,10 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
  * Return: The page on success or NULL if allocation fails.
  */
 struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr, int node, bool hugepage)
+		unsigned long addr, bool hugepage)
 {
 	struct mempolicy *pol;
+	int node = numa_node_id();
 	struct page *page;
 	int preferred_nid;
 	nodemask_t *nmask;
diff --git a/mm/shmem.c b/mm/shmem.c
index 8f940552c182..0700e9acf53b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1564,8 +1564,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
 		return NULL;
 
 	shmem_pseudo_vma_init(&pvma, info, hindex);
-	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
-			       true);
+	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
 	shmem_pseudo_vma_destroy(&pvma);
 	if (page)
 		prep_transhuge_page(page);
--
cgit v1.2.3