summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexander Gordeev <agordeev@linux.ibm.com>2021-11-04 09:14:44 +0300
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2022-01-27 13:05:10 +0300
commit9d28671017047af1ca14d00c946f02e0fd871ff4 (patch)
treeef6b61c3d4db53581410e51d56f98c40db810c43
parentb5498bbba0d3072675d0391ebcc28e618944b105 (diff)
downloadlinux-9d28671017047af1ca14d00c946f02e0fd871ff4.tar.xz
s390/mm: fix 2KB pgtable release race
commit c2c224932fd0ee6854d6ebfc8d059c2bcad86606 upstream. There is a race on concurrent 2KB-pgtables release paths when both upper and lower halves of the containing parent page are freed, one via page_table_free_rcu() + __tlb_remove_table(), and the other via page_table_free(). The race might lead to a corruption as result of remove of list item in page_table_free() concurrently with __free_page() in __tlb_remove_table(). Let's assume first the lower and next the upper 2KB-pgtables are freed from a page. Since both halves of the page are allocated the tracking byte (bits 24-31 of the page _refcount) has value of 0x03 initially: CPU0 CPU1 ---- ---- page_table_free_rcu() // lower half { // _refcount[31..24] == 0x03 ... atomic_xor_bits(&page->_refcount, 0x11U << (0 + 24)); // _refcount[31..24] <= 0x12 ... table = table | (1U << 0); tlb_remove_table(tlb, table); } ... __tlb_remove_table() { // _refcount[31..24] == 0x12 mask = _table & 3; // mask <= 0x01 ... page_table_free() // upper half { // _refcount[31..24] == 0x12 ... atomic_xor_bits( &page->_refcount, 1U << (1 + 24)); // _refcount[31..24] <= 0x10 // mask <= 0x10 ... atomic_xor_bits(&page->_refcount, mask << (4 + 24)); // _refcount[31..24] <= 0x00 // mask <= 0x00 ... if (mask != 0) // == false break; fallthrough; ... if (mask & 3) // == false ... else __free_page(page); list_del(&page->lru); ^^^^^^^^^^^^^^^^^^ RACE! ^^^^^^^^^^^^^^^^^^^^^ } ... } The problem is page_table_free() releases the page as result of lower nibble unset and __tlb_remove_table() observing zero too early. With this update page_table_free() will use the similar logic as page_table_free_rcu() + __tlb_remove_table(), and mark the fragment as pending for removal in the upper nibble until after the list_del(). In other words, the parent page is considered as unreferenced and safe to release only when the lower nibble is cleared already and unsetting a bit in upper nibble results in that nibble turned zero. Cc: stable@vger.kernel.org Suggested-by: Vlastimil Babka <vbabka@suse.com> Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com> Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com> Signed-off-by: Heiko Carstens <hca@linux.ibm.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r--arch/s390/mm/pgalloc.c4
1 files changed, 3 insertions, 1 deletions
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 781965f7210e..91e478e09b54 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -244,13 +244,15 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
/* Free 2K page table fragment of a 4K page */
bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
spin_lock_bh(&mm->context.lock);
- mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24));
+ mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
mask >>= 24;
if (mask & 3)
list_add(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
spin_unlock_bh(&mm->context.lock);
+ mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
+ mask >>= 24;
if (mask != 0)
return;
} else {