From d549cb65b7982ddec3ef17d398e7d893a9d3705f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 2 Nov 2017 15:59:50 -0700 Subject: mm, swap: fix race between swap count continuation operations commit 2628bd6fc052bd85e9864dae4de494d8a6313391 upstream. One page may store a set of entries of the sis->swap_map (swap_info_struct->swap_map) in multiple swap clusters. If some of the entries has sis->swap_map[offset] > SWAP_MAP_MAX, multiple pages will be used to store the set of entries of the sis->swap_map. And the pages are linked with page->lru. This is called swap count continuation. To access the pages which store the set of entries of the sis->swap_map simultaneously, previously, sis->lock is used. But to improve the scalability of __swap_duplicate(), swap cluster lock may be used in swap_count_continued() now. This may race with add_swap_count_continuation() which operates on a nearby swap cluster, in which the sis->swap_map entries are stored in the same page. The race can cause wrong swap count in practice, thus cause unfreeable swap entries or software lockup, etc. To fix the race, a new spin lock called cont_lock is added to struct swap_info_struct to protect the swap count continuation page list. This is a lock at the swap device level, so the scalability isn't very well. But it is still much better than the original sis->lock, because it is only acquired/released when swap count continuation is used. Which is considered rare in practice. If it turns out that the scalability becomes an issue for some workloads, we can split the lock into some more fine grained locks. Link: http://lkml.kernel.org/r/20171017081320.28133-1-ying.huang@intel.com Fixes: 235b62176712 ("mm/swap: add cluster lock") Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Shaohua Li Cc: Tim Chen Cc: Michal Hocko Cc: Aaron Lu Cc: Dave Hansen Cc: Andi Kleen Cc: Minchan Kim Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/swapfile.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index a8952b6563c6..3191465b0ccf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2635,6 +2635,7 @@ static struct swap_info_struct *alloc_swap_info(void) p->flags = SWP_USED; spin_unlock(&swap_lock); spin_lock_init(&p->lock); + spin_lock_init(&p->cont_lock); return p; } @@ -3307,6 +3308,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) head = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; + spin_lock(&si->cont_lock); /* * Page allocation does not initialize the page's lru field, * but it does always reset its private field. @@ -3326,7 +3328,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) * a continuation page, free our allocation and use this one. */ if (!(count & COUNT_CONTINUED)) - goto out; + goto out_unlock_cont; map = kmap_atomic(list_page) + offset; count = *map; @@ -3337,11 +3339,13 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) * free our allocation and use this one. */ if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) - goto out; + goto out_unlock_cont; } list_add_tail(&page->lru, &head->lru); page = NULL; /* now it's attached, don't free it */ +out_unlock_cont: + spin_unlock(&si->cont_lock); out: unlock_cluster(ci); spin_unlock(&si->lock); @@ -3366,6 +3370,7 @@ static bool swap_count_continued(struct swap_info_struct *si, struct page *head; struct page *page; unsigned char *map; + bool ret; head = vmalloc_to_page(si->swap_map + offset); if (page_private(head) != SWP_CONTINUED) { @@ -3373,6 +3378,7 @@ static bool swap_count_continued(struct swap_info_struct *si, return false; /* need to add count continuation */ } + spin_lock(&si->cont_lock); offset &= ~PAGE_MASK; page = list_entry(head->lru.next, struct page, lru); map = kmap_atomic(page) + offset; @@ -3393,8 +3399,10 @@ static bool swap_count_continued(struct swap_info_struct *si, if (*map == SWAP_CONT_MAX) { kunmap_atomic(map); page = list_entry(page->lru.next, struct page, lru); - if (page == head) - return false; /* add count continuation */ + if (page == head) { + ret = false; /* add count continuation */ + goto out; + } map = kmap_atomic(page) + offset; init_map: *map = 0; /* we didn't zero the page */ } @@ -3407,7 +3415,7 @@ init_map: *map = 0; /* we didn't zero the page */ kunmap_atomic(map); page = list_entry(page->lru.prev, struct page, lru); } - return true; /* incremented */ + ret = true; /* incremented */ } else { /* decrementing */ /* @@ -3433,8 +3441,11 @@ init_map: *map = 0; /* we didn't zero the page */ kunmap_atomic(map); page = list_entry(page->lru.prev, struct page, lru); } - return count == COUNT_CONTINUED; + ret = count == COUNT_CONTINUED; } +out: + spin_unlock(&si->cont_lock); + return ret; } /* -- cgit v1.2.3