From c69307d533d7aa7cc8894dbbb8a274599f8630d7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:28:41 +0100 Subject: sched/numa: Fix comments Fix a 80 column violation and a PTE vs PMD reference. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1381141781-10992-4-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7489884682d8..19dbb08c64a5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1305,7 +1305,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); lock_page(page); - /* Confirm the PTE did not while locked */ + /* Confirm the PMD did not change while page_table_lock was released */ spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) { unlock_page(page); -- cgit v1.2.3 From 0c3a775e1e0b069bf765f8355b723ce0d18dcc6c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:42 +0100 Subject: mm: numa: Do not account for a hinting fault if we raced If another task handled a hinting fault in parallel then do not double account for it. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-5-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 19dbb08c64a5..dab2bab9d33e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1325,8 +1325,11 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, check_same: spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) + if (unlikely(!pmd_same(pmd, *pmdp))) { + /* Someone else took our fault */ + current_nid = -1; goto out_unlock; + } clear_pmdnuma: pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); -- cgit v1.2.3 From ff9042b11a71c81238c70af168cd36b98a6d5a3c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:43 +0100 Subject: mm: Wait for THP migrations to complete during NUMA hinting faults The locking for migrating THP is unusual. While normal page migration prevents parallel accesses using a migration PTE, THP migration relies on a combination of the page_table_lock, the page lock and the existance of the NUMA hinting PTE to guarantee safety but there is a bug in the scheme. If a THP page is currently being migrated and another thread traps a fault on the same page it checks if the page is misplaced. If it is not, then pmd_numa is cleared. The problem is that it checks if the page is misplaced without holding the page lock meaning that the racing thread can be migrating the THP when the second thread clears the NUMA bit and faults a stale page. This patch checks if the page is potentially being migrated and stalls using the lock_page if it is potentially being migrated before checking if the page is misplaced or not. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-6-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index dab2bab9d33e..f362363c0fad 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1295,13 +1295,14 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (current_nid == numa_node_id()) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); - target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == -1) { - put_page(page); - goto clear_pmdnuma; - } + /* + * Acquire the page lock to serialise THP migrations but avoid dropping + * page_table_lock if at all possible + */ + if (trylock_page(page)) + goto got_lock; - /* Acquire the page lock to serialise THP migrations */ + /* Serialise against migrationa and check placement check placement */ spin_unlock(&mm->page_table_lock); lock_page(page); @@ -1312,9 +1313,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, put_page(page); goto out_unlock; } - spin_unlock(&mm->page_table_lock); + +got_lock: + target_nid = mpol_misplaced(page, vma, haddr); + if (target_nid == -1) { + unlock_page(page); + put_page(page); + goto clear_pmdnuma; + } /* Migrate the THP to the requested node */ + spin_unlock(&mm->page_table_lock); migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd, addr, page, target_nid); if (!migrated) -- cgit v1.2.3 From b8916634b77bffb233d8f2f45703c80343457cc1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:44 +0100 Subject: mm: Prevent parallel splits during THP migration THP migrations are serialised by the page lock but on its own that does not prevent THP splits. If the page is split during THP migration then the pmd_same checks will prevent page table corruption but the unlock page and other fix-ups potentially will cause corruption. This patch takes the anon_vma lock to prevent parallel splits during migration. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-7-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f362363c0fad..1d6334fc8b6d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1278,18 +1278,18 @@ out: int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp) { + struct anon_vma *anon_vma = NULL; struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; int target_nid; int current_nid = -1; - bool migrated; + bool migrated, page_locked; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; page = pmd_page(pmd); - get_page(page); current_nid = page_to_nid(page); count_vm_numa_event(NUMA_HINT_FAULTS); if (current_nid == numa_node_id()) @@ -1299,12 +1299,29 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * Acquire the page lock to serialise THP migrations but avoid dropping * page_table_lock if at all possible */ - if (trylock_page(page)) - goto got_lock; + page_locked = trylock_page(page); + target_nid = mpol_misplaced(page, vma, haddr); + if (target_nid == -1) { + /* If the page was locked, there are no parallel migrations */ + if (page_locked) { + unlock_page(page); + goto clear_pmdnuma; + } - /* Serialise against migrationa and check placement check placement */ + /* Otherwise wait for potential migrations and retry fault */ + spin_unlock(&mm->page_table_lock); + wait_on_page_locked(page); + goto out; + } + + /* Page is misplaced, serialise migrations and parallel THP splits */ + get_page(page); spin_unlock(&mm->page_table_lock); - lock_page(page); + if (!page_locked) { + lock_page(page); + page_locked = true; + } + anon_vma = page_lock_anon_vma_read(page); /* Confirm the PMD did not change while page_table_lock was released */ spin_lock(&mm->page_table_lock); @@ -1314,14 +1331,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_unlock; } -got_lock: - target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == -1) { - unlock_page(page); - put_page(page); - goto clear_pmdnuma; - } - /* Migrate the THP to the requested node */ spin_unlock(&mm->page_table_lock); migrated = migrate_misplaced_transhuge_page(mm, vma, @@ -1330,6 +1339,8 @@ got_lock: goto check_same; task_numa_fault(target_nid, HPAGE_PMD_NR, true); + if (anon_vma) + page_unlock_anon_vma_read(anon_vma); return 0; check_same: @@ -1346,6 +1357,11 @@ clear_pmdnuma: update_mmu_cache_pmd(vma, addr, pmdp); out_unlock: spin_unlock(&mm->page_table_lock); + +out: + if (anon_vma) + page_unlock_anon_vma_read(anon_vma); + if (current_nid != -1) task_numa_fault(current_nid, HPAGE_PMD_NR, false); return 0; -- cgit v1.2.3 From 8191acbd30c73e45c24ad16c372e0b42cc7ac8f8 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:45 +0100 Subject: mm: numa: Sanitize task_numa_fault() callsites There are three callers of task_numa_fault(): - do_huge_pmd_numa_page(): Accounts against the current node, not the node where the page resides, unless we migrated, in which case it accounts against the node we migrated to. - do_numa_page(): Accounts against the current node, not the node where the page resides, unless we migrated, in which case it accounts against the node we migrated to. - do_pmd_numa_page(): Accounts not at all when the page isn't migrated, otherwise accounts against the node we migrated towards. This seems wrong to me; all three sites should have the same sementaics, furthermore we should accounts against where the page really is, we already know where the task is. So modify all three sites to always account; we did after all receive the fault; and always account to where the page is after migration, regardless of success. They all still differ on when they clear the PTE/PMD; ideally that would get sorted too. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-8-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 25 +++++++++++++------------ mm/memory.c | 53 +++++++++++++++++++++-------------------------------- 2 files changed, 34 insertions(+), 44 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1d6334fc8b6d..c3bb65f284d5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1281,18 +1281,19 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct anon_vma *anon_vma = NULL; struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; + int page_nid = -1, this_nid = numa_node_id(); int target_nid; - int current_nid = -1; - bool migrated, page_locked; + bool page_locked; + bool migrated = false; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; page = pmd_page(pmd); - current_nid = page_to_nid(page); + page_nid = page_to_nid(page); count_vm_numa_event(NUMA_HINT_FAULTS); - if (current_nid == numa_node_id()) + if (page_nid == this_nid) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); /* @@ -1335,19 +1336,18 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd, addr, page, target_nid); - if (!migrated) + if (migrated) + page_nid = target_nid; + else goto check_same; - task_numa_fault(target_nid, HPAGE_PMD_NR, true); - if (anon_vma) - page_unlock_anon_vma_read(anon_vma); - return 0; + goto out; check_same: spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) { /* Someone else took our fault */ - current_nid = -1; + page_nid = -1; goto out_unlock; } clear_pmdnuma: @@ -1362,8 +1362,9 @@ out: if (anon_vma) page_unlock_anon_vma_read(anon_vma); - if (current_nid != -1) - task_numa_fault(current_nid, HPAGE_PMD_NR, false); + if (page_nid != -1) + task_numa_fault(page_nid, HPAGE_PMD_NR, migrated); + return 0; } diff --git a/mm/memory.c b/mm/memory.c index ca0003947115..42ae82ee04c1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3519,12 +3519,12 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, } int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, - unsigned long addr, int current_nid) + unsigned long addr, int page_nid) { get_page(page); count_vm_numa_event(NUMA_HINT_FAULTS); - if (current_nid == numa_node_id()) + if (page_nid == numa_node_id()) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); return mpol_misplaced(page, vma, addr); @@ -3535,7 +3535,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *page = NULL; spinlock_t *ptl; - int current_nid = -1; + int page_nid = -1; int target_nid; bool migrated = false; @@ -3565,15 +3565,10 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } - current_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, addr, current_nid); + page_nid = page_to_nid(page); + target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(ptep, ptl); if (target_nid == -1) { - /* - * Account for the fault against the current node if it not - * being replaced regardless of where the page is located. - */ - current_nid = numa_node_id(); put_page(page); goto out; } @@ -3581,11 +3576,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Migrate to the requested node */ migrated = migrate_misplaced_page(page, target_nid); if (migrated) - current_nid = target_nid; + page_nid = target_nid; out: - if (current_nid != -1) - task_numa_fault(current_nid, 1, migrated); + if (page_nid != -1) + task_numa_fault(page_nid, 1, migrated); return 0; } @@ -3600,7 +3595,6 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long offset; spinlock_t *ptl; bool numa = false; - int local_nid = numa_node_id(); spin_lock(&mm->page_table_lock); pmd = *pmdp; @@ -3623,9 +3617,10 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { pte_t pteval = *pte; struct page *page; - int curr_nid = local_nid; + int page_nid = -1; int target_nid; - bool migrated; + bool migrated = false; + if (!pte_present(pteval)) continue; if (!pte_numa(pteval)) @@ -3647,25 +3642,19 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(page_mapcount(page) != 1)) continue; - /* - * Note that the NUMA fault is later accounted to either - * the node that is currently running or where the page is - * migrated to. - */ - curr_nid = local_nid; - target_nid = numa_migrate_prep(page, vma, addr, - page_to_nid(page)); - if (target_nid == -1) { + page_nid = page_to_nid(page); + target_nid = numa_migrate_prep(page, vma, addr, page_nid); + pte_unmap_unlock(pte, ptl); + if (target_nid != -1) { + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) + page_nid = target_nid; + } else { put_page(page); - continue; } - /* Migrate to the requested node */ - pte_unmap_unlock(pte, ptl); - migrated = migrate_misplaced_page(page, target_nid); - if (migrated) - curr_nid = target_nid; - task_numa_fault(curr_nid, 1, migrated); + if (page_nid != -1) + task_numa_fault(page_nid, 1, migrated); pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } -- cgit v1.2.3 From a54a407fbf7735fd8f7841375574f5d9b0375f93 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:46 +0100 Subject: mm: Close races between THP migration and PMD numa clearing THP migration uses the page lock to guard against parallel allocations but there are cases like this still open Task A Task B --------------------- --------------------- do_huge_pmd_numa_page do_huge_pmd_numa_page lock_page mpol_misplaced == -1 unlock_page goto clear_pmdnuma lock_page mpol_misplaced == 2 migrate_misplaced_transhuge pmd = pmd_mknonnuma set_pmd_at During hours of testing, one crashed with weird errors and while I have no direct evidence, I suspect something like the race above happened. This patch extends the page lock to being held until the pmd_numa is cleared to prevent migration starting in parallel while the pmd_numa is being cleared. It also flushes the old pmd entry and orders pagetable insertion before rmap insertion. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-9-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 33 +++++++++++++++------------------ mm/migrate.c | 19 +++++++++++-------- 2 files changed, 26 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c3bb65f284d5..d4928769680f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1304,24 +1304,25 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, target_nid = mpol_misplaced(page, vma, haddr); if (target_nid == -1) { /* If the page was locked, there are no parallel migrations */ - if (page_locked) { - unlock_page(page); + if (page_locked) goto clear_pmdnuma; - } - /* Otherwise wait for potential migrations and retry fault */ + /* + * Otherwise wait for potential migrations and retry. We do + * relock and check_same as the page may no longer be mapped. + * As the fault is being retried, do not account for it. + */ spin_unlock(&mm->page_table_lock); wait_on_page_locked(page); + page_nid = -1; goto out; } /* Page is misplaced, serialise migrations and parallel THP splits */ get_page(page); spin_unlock(&mm->page_table_lock); - if (!page_locked) { + if (!page_locked) lock_page(page); - page_locked = true; - } anon_vma = page_lock_anon_vma_read(page); /* Confirm the PMD did not change while page_table_lock was released */ @@ -1329,32 +1330,28 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!pmd_same(pmd, *pmdp))) { unlock_page(page); put_page(page); + page_nid = -1; goto out_unlock; } - /* Migrate the THP to the requested node */ + /* + * Migrate the THP to the requested node, returns with page unlocked + * and pmd_numa cleared. + */ spin_unlock(&mm->page_table_lock); migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd, addr, page, target_nid); if (migrated) page_nid = target_nid; - else - goto check_same; goto out; - -check_same: - spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) { - /* Someone else took our fault */ - page_nid = -1; - goto out_unlock; - } clear_pmdnuma: + BUG_ON(!PageLocked(page)); pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); VM_BUG_ON(pmd_numa(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); + unlock_page(page); out_unlock: spin_unlock(&mm->page_table_lock); diff --git a/mm/migrate.c b/mm/migrate.c index a26bccd44ccb..7bd90d3b16bb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1713,12 +1713,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, unlock_page(new_page); put_page(new_page); /* Free it */ - unlock_page(page); + /* Retake the callers reference and putback on LRU */ + get_page(page); putback_lru_page(page); - - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); - isolated = 0; - goto out; + mod_zone_page_state(page_zone(page), + NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); + goto out_fail; } /* @@ -1735,9 +1735,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); entry = pmd_mkhuge(entry); - page_add_new_anon_rmap(new_page, vma, haddr); - + pmdp_clear_flush(vma, haddr, pmd); set_pmd_at(mm, haddr, pmd, entry); + page_add_new_anon_rmap(new_page, vma, haddr); update_mmu_cache_pmd(vma, address, &entry); page_remove_rmap(page); /* @@ -1756,7 +1756,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); -out: mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); @@ -1765,6 +1764,10 @@ out: out_fail: count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); out_dropref: + entry = pmd_mknonnuma(entry); + set_pmd_at(mm, haddr, pmd, entry); + update_mmu_cache_pmd(vma, address, &entry); + unlock_page(page); put_page(page); return 0; -- cgit v1.2.3 From afcae2655b0ab67e65f161b1bb214efcfa1db415 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:47 +0100 Subject: mm: Account for a THP NUMA hinting update as one PTE update A THP PMD update is accounted for as 512 pages updated in vmstat. This is large difference when estimating the cost of automatic NUMA balancing and can be misleading when comparing results that had collapsed versus split THP. This patch addresses the accounting issue. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-10-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/mprotect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index 94722a4d6b43..2bbb648ea73f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -145,7 +145,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, split_huge_page_pmd(vma, addr, pmd); else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { - pages += HPAGE_PMD_NR; + pages++; continue; } /* fall through */ -- cgit v1.2.3 From e920e14ca29b0b2a981cfc90e4e20edd6f078d19 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:48 +0100 Subject: mm: Do not flush TLB during protection change if !pte_present && !migration_entry NUMA PTE scanning is expensive both in terms of the scanning itself and the TLB flush if there are any updates. Currently non-present PTEs are accounted for as an update and incurring a TLB flush where it is only necessary for anonymous migration entries. This patch addresses the problem and should reduce TLB flushes. Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Reviewed-by: Rik van Riel Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-11-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/mprotect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index 2bbb648ea73f..7bdbd4b0f6d9 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -101,8 +101,9 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, make_migration_entry_read(&entry); set_pte_at(mm, addr, pte, swp_entry_to_pte(entry)); + + pages++; } - pages++; } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); -- cgit v1.2.3 From f123d74abf91574837d14e5ea58f6a779a387bf5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:49 +0100 Subject: mm: Only flush TLBs if a transhuge PMD is modified for NUMA pte scanning NUMA PTE scanning is expensive both in terms of the scanning itself and the TLB flush if there are any updates. The TLB flush is avoided if no PTEs are updated but there is a bug where transhuge PMDs are considered to be updated even if they were already pmd_numa. This patch addresses the problem and TLB flushes should be reduced. Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Reviewed-by: Rik van Riel Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-12-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 19 ++++++++++++++++--- mm/mprotect.c | 14 ++++++++++---- 2 files changed, 26 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d4928769680f..de8d5cfc2bf2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1458,6 +1458,12 @@ out: return ret; } +/* + * Returns + * - 0 if PMD could not be locked + * - 1 if PMD was locked but protections unchange and TLB flush unnecessary + * - HPAGE_PMD_NR is protections changed and TLB flush necessary + */ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa) { @@ -1466,9 +1472,11 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma) == 1) { pmd_t entry; - entry = pmdp_get_and_clear(mm, addr, pmd); + ret = 1; if (!prot_numa) { + entry = pmdp_get_and_clear(mm, addr, pmd); entry = pmd_modify(entry, newprot); + ret = HPAGE_PMD_NR; BUG_ON(pmd_write(entry)); } else { struct page *page = pmd_page(*pmd); @@ -1476,12 +1484,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, /* only check non-shared pages */ if (page_mapcount(page) == 1 && !pmd_numa(*pmd)) { + entry = pmdp_get_and_clear(mm, addr, pmd); entry = pmd_mknuma(entry); + ret = HPAGE_PMD_NR; } } - set_pmd_at(mm, addr, pmd, entry); + + /* Set PMD if cleared earlier */ + if (ret == HPAGE_PMD_NR) + set_pmd_at(mm, addr, pmd, entry); + spin_unlock(&vma->vm_mm->page_table_lock); - ret = 1; } return ret; diff --git a/mm/mprotect.c b/mm/mprotect.c index 7bdbd4b0f6d9..2da33dca6134 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -144,10 +144,16 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot, - prot_numa)) { - pages++; - continue; + else { + int nr_ptes = change_huge_pmd(vma, pmd, addr, + newprot, prot_numa); + + if (nr_ptes) { + if (nr_ptes == HPAGE_PMD_NR) + pages++; + + continue; + } } /* fall through */ } -- cgit v1.2.3 From a1a46184e34cfd0764f06a54870defa052b0a094 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:50 +0100 Subject: mm: numa: Do not migrate or account for hinting faults on the zero page The zero page is not replicated between nodes and is often shared between processes. The data is read-only and likely to be cached in local CPUs if heavily accessed meaning that the remote memory access cost is less of a concern. This patch prevents trapping faults on the zero pages. For tasks using the zero page this will reduce the number of PTE updates, TLB flushes and hinting faults. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju [ Correct use of is_huge_zero_page] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-13-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 10 +++++++++- mm/memory.c | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index de8d5cfc2bf2..8677dbf31c2e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1291,6 +1291,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_unlock; page = pmd_page(pmd); + BUG_ON(is_huge_zero_page(page)); page_nid = page_to_nid(page); count_vm_numa_event(NUMA_HINT_FAULTS); if (page_nid == this_nid) @@ -1481,8 +1482,15 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, } else { struct page *page = pmd_page(*pmd); - /* only check non-shared pages */ + /* + * Only check non-shared pages. Do not trap faults + * against the zero page. The read-only data is likely + * to be read-cached on the local CPU cache and it is + * less useful to know about local vs remote hits on + * the zero page. + */ if (page_mapcount(page) == 1 && + !is_huge_zero_page(page) && !pmd_numa(*pmd)) { entry = pmdp_get_and_clear(mm, addr, pmd); entry = pmd_mknuma(entry); diff --git a/mm/memory.c b/mm/memory.c index 42ae82ee04c1..ed51f15136ee 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3564,6 +3564,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(ptep, ptl); return 0; } + BUG_ON(is_zero_pfn(page_to_pfn(page))); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); -- cgit v1.2.3 From ac8e895bd260cb8bb19ade6a3abd44e7abe9a01d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:03 +0100 Subject: sched/numa: Add infrastructure for split shared/private accounting of NUMA hinting faults Ideally it would be possible to distinguish between NUMA hinting faults that are private to a task and those that are shared. This patch prepares infrastructure for separately accounting shared and private faults by allocating the necessary buffers and passing in relevant information. For now, all faults are treated as private and detection will be introduced later. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-26-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 +++-- kernel/sched/fair.c | 46 +++++++++++++++++++++++++++++++++++----------- mm/huge_memory.c | 5 +++-- mm/memory.c | 8 ++++++-- 4 files changed, 47 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/include/linux/sched.h b/include/linux/sched.h index aecdc5a18773..d946195eec10 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1445,10 +1445,11 @@ struct task_struct { #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) #ifdef CONFIG_NUMA_BALANCING -extern void task_numa_fault(int node, int pages, bool migrated); +extern void task_numa_fault(int last_node, int node, int pages, bool migrated); extern void set_numabalancing_state(bool enabled); #else -static inline void task_numa_fault(int node, int pages, bool migrated) +static inline void task_numa_fault(int last_node, int node, int pages, + bool migrated) { } static inline void set_numabalancing_state(bool enabled) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8b15e9e1d1b8..89eeb89fd99a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -886,6 +886,20 @@ static unsigned int task_scan_max(struct task_struct *p) */ unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3; +static inline int task_faults_idx(int nid, int priv) +{ + return 2 * nid + priv; +} + +static inline unsigned long task_faults(struct task_struct *p, int nid) +{ + if (!p->numa_faults) + return 0; + + return p->numa_faults[task_faults_idx(nid, 0)] + + p->numa_faults[task_faults_idx(nid, 1)]; +} + static unsigned long weighted_cpuload(const int cpu); @@ -928,13 +942,19 @@ static void task_numa_placement(struct task_struct *p) /* Find the node with the highest number of faults */ for_each_online_node(nid) { unsigned long faults; + int priv, i; - /* Decay existing window and copy faults since last scan */ - p->numa_faults[nid] >>= 1; - p->numa_faults[nid] += p->numa_faults_buffer[nid]; - p->numa_faults_buffer[nid] = 0; + for (priv = 0; priv < 2; priv++) { + i = task_faults_idx(nid, priv); - faults = p->numa_faults[nid]; + /* Decay existing window, copy faults since last scan */ + p->numa_faults[i] >>= 1; + p->numa_faults[i] += p->numa_faults_buffer[i]; + p->numa_faults_buffer[i] = 0; + } + + /* Find maximum private faults */ + faults = p->numa_faults[task_faults_idx(nid, 1)]; if (faults > max_faults) { max_faults = faults; max_nid = nid; @@ -970,16 +990,20 @@ static void task_numa_placement(struct task_struct *p) /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int node, int pages, bool migrated) +void task_numa_fault(int last_nid, int node, int pages, bool migrated) { struct task_struct *p = current; + int priv; if (!numabalancing_enabled) return; + /* For now, do not attempt to detect private/shared accesses */ + priv = 1; + /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults)) { - int size = sizeof(*p->numa_faults) * nr_node_ids; + int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; /* numa_faults and numa_faults_buffer share the allocation */ p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); @@ -987,7 +1011,7 @@ void task_numa_fault(int node, int pages, bool migrated) return; BUG_ON(p->numa_faults_buffer); - p->numa_faults_buffer = p->numa_faults + nr_node_ids; + p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); } /* @@ -1005,7 +1029,7 @@ void task_numa_fault(int node, int pages, bool migrated) task_numa_placement(p); - p->numa_faults_buffer[node] += pages; + p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; } static void reset_ptenuma_scan(struct task_struct *p) @@ -4146,7 +4170,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) return false; if (dst_nid == p->numa_preferred_nid || - p->numa_faults[dst_nid] > p->numa_faults[src_nid]) + task_faults(p, dst_nid) > task_faults(p, src_nid)) return true; return false; @@ -4170,7 +4194,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) p->numa_migrate_seq >= sysctl_numa_balancing_settle_count) return false; - if (p->numa_faults[dst_nid] < p->numa_faults[src_nid]) + if (task_faults(p, dst_nid) < task_faults(p, src_nid)) return true; return false; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8677dbf31c2e..914216733e0a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1282,7 +1282,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); - int target_nid; + int target_nid, last_nid = -1; bool page_locked; bool migrated = false; @@ -1293,6 +1293,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pmd_page(pmd); BUG_ON(is_huge_zero_page(page)); page_nid = page_to_nid(page); + last_nid = page_nid_last(page); count_vm_numa_event(NUMA_HINT_FAULTS); if (page_nid == this_nid) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); @@ -1361,7 +1362,7 @@ out: page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(page_nid, HPAGE_PMD_NR, migrated); + task_numa_fault(last_nid, page_nid, HPAGE_PMD_NR, migrated); return 0; } diff --git a/mm/memory.c b/mm/memory.c index ed51f15136ee..24bc9b848af6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3536,6 +3536,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page = NULL; spinlock_t *ptl; int page_nid = -1; + int last_nid; int target_nid; bool migrated = false; @@ -3566,6 +3567,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } BUG_ON(is_zero_pfn(page_to_pfn(page))); + last_nid = page_nid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(ptep, ptl); @@ -3581,7 +3583,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, out: if (page_nid != -1) - task_numa_fault(page_nid, 1, migrated); + task_numa_fault(last_nid, page_nid, 1, migrated); return 0; } @@ -3596,6 +3598,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long offset; spinlock_t *ptl; bool numa = false; + int last_nid; spin_lock(&mm->page_table_lock); pmd = *pmdp; @@ -3643,6 +3646,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(page_mapcount(page) != 1)) continue; + last_nid = page_nid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(pte, ptl); @@ -3655,7 +3659,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } if (page_nid != -1) - task_numa_fault(page_nid, 1, migrated); + task_numa_fault(last_nid, page_nid, 1, migrated); pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } -- cgit v1.2.3 From 1bc115d87dffd1c43bdc3c9c9d1e3a51c195d18e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:05 +0100 Subject: mm: numa: Scan pages with elevated page_mapcount Currently automatic NUMA balancing is unable to distinguish between false shared versus private pages except by ignoring pages with an elevated page_mapcount entirely. This avoids shared pages bouncing between the nodes whose task is using them but that is ignored quite a lot of data. This patch kicks away the training wheels in preparation for adding support for identifying shared/private pages is now in place. The ordering is so that the impact of the shared/private detection can be easily measured. Note that the patch does not migrate shared, file-backed within vmas marked VM_EXEC as these are generally shared library pages. Migrating such pages is not beneficial as there is an expectation they are read-shared between caches and iTLB and iCache pressure is generally low. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-28-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/migrate.h | 7 ++++--- mm/huge_memory.c | 12 +++++------- mm/memory.c | 7 ++----- mm/migrate.c | 17 ++++++----------- mm/mprotect.c | 4 +--- 5 files changed, 18 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 8d3c57fdf221..f5096b58b20d 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -90,11 +90,12 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_NUMA_BALANCING -extern int migrate_misplaced_page(struct page *page, int node); -extern int migrate_misplaced_page(struct page *page, int node); +extern int migrate_misplaced_page(struct page *page, + struct vm_area_struct *vma, int node); extern bool migrate_ratelimited(int node); #else -static inline int migrate_misplaced_page(struct page *page, int node) +static inline int migrate_misplaced_page(struct page *page, + struct vm_area_struct *vma, int node) { return -EAGAIN; /* can't migrate now */ } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 914216733e0a..2a28c2c6c165 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1484,14 +1484,12 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, struct page *page = pmd_page(*pmd); /* - * Only check non-shared pages. Do not trap faults - * against the zero page. The read-only data is likely - * to be read-cached on the local CPU cache and it is - * less useful to know about local vs remote hits on - * the zero page. + * Do not trap faults against the zero page. The + * read-only data is likely to be read-cached on the + * local CPU cache and it is less useful to know about + * local vs remote hits on the zero page. */ - if (page_mapcount(page) == 1 && - !is_huge_zero_page(page) && + if (!is_huge_zero_page(page) && !pmd_numa(*pmd)) { entry = pmdp_get_and_clear(mm, addr, pmd); entry = pmd_mknuma(entry); diff --git a/mm/memory.c b/mm/memory.c index 24bc9b848af6..3e3b4b8b6c41 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3577,7 +3577,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } /* Migrate to the requested node */ - migrated = migrate_misplaced_page(page, target_nid); + migrated = migrate_misplaced_page(page, vma, target_nid); if (migrated) page_nid = target_nid; @@ -3642,16 +3642,13 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = vm_normal_page(vma, addr, pteval); if (unlikely(!page)) continue; - /* only check non-shared pages */ - if (unlikely(page_mapcount(page) != 1)) - continue; last_nid = page_nid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(pte, ptl); if (target_nid != -1) { - migrated = migrate_misplaced_page(page, target_nid); + migrated = migrate_misplaced_page(page, vma, target_nid); if (migrated) page_nid = target_nid; } else { diff --git a/mm/migrate.c b/mm/migrate.c index 7bd90d3b16bb..fcba2f46bb80 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1599,7 +1599,8 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) * node. Caller is expected to have an elevated reference count on * the page that will be dropped by this function before returning. */ -int migrate_misplaced_page(struct page *page, int node) +int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, + int node) { pg_data_t *pgdat = NODE_DATA(node); int isolated; @@ -1607,10 +1608,11 @@ int migrate_misplaced_page(struct page *page, int node) LIST_HEAD(migratepages); /* - * Don't migrate pages that are mapped in multiple processes. - * TODO: Handle false sharing detection instead of this hammer + * Don't migrate file pages that are mapped in multiple processes + * with execute permissions as they are probably shared libraries. */ - if (page_mapcount(page) != 1) + if (page_mapcount(page) != 1 && page_is_file_cache(page) && + (vma->vm_flags & VM_EXEC)) goto out; /* @@ -1660,13 +1662,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, struct mem_cgroup *memcg = NULL; int page_lru = page_is_file_cache(page); - /* - * Don't migrate pages that are mapped in multiple processes. - * TODO: Handle false sharing detection instead of this hammer - */ - if (page_mapcount(page) != 1) - goto out_dropref; - /* * Rate-limit the amount of data that is being migrated to a node. * Optimal placement is no good if the memory bus is saturated and diff --git a/mm/mprotect.c b/mm/mprotect.c index 2da33dca6134..41e02923fcd9 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -69,9 +69,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (last_nid != this_nid) all_same_node = false; - /* only check non-shared pages */ - if (!pte_numa(oldpte) && - page_mapcount(page) == 1) { + if (!pte_numa(oldpte)) { ptent = pte_mknuma(ptent); updated = true; } -- cgit v1.2.3 From b795854b1fa70f6aee923ae5df74ff7afeaddcaa Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:07 +0100 Subject: sched/numa: Set preferred NUMA node based on number of private faults Ideally it would be possible to distinguish between NUMA hinting faults that are private to a task and those that are shared. If treated identically there is a risk that shared pages bounce between nodes depending on the order they are referenced by tasks. Ultimately what is desirable is that task private pages remain local to the task while shared pages are interleaved between sharing tasks running on different nodes to give good average performance. This is further complicated by THP as even applications that partition their data may not be partitioning on a huge page boundary. To start with, this patch assumes that multi-threaded or multi-process applications partition their data and that in general the private accesses are more important for cpu->memory locality in the general case. Also, no new infrastructure is required to treat private pages properly but interleaving for shared pages requires additional infrastructure. To detect private accesses the pid of the last accessing task is required but the storage requirements are a high. This patch borrows heavily from Ingo Molnar's patch "numa, mm, sched: Implement last-CPU+PID hash tracking" to encode some bits from the last accessing task in the page flags as well as the node information. Collisions will occur but it is better than just depending on the node information. Node information is then used to determine if a page needs to migrate. The PID information is used to detect private/shared accesses. The preferred NUMA node is selected based on where the maximum number of approximately private faults were measured. Shared faults are not taken into consideration for a few reasons. First, if there are many tasks sharing the page then they'll all move towards the same node. The node will be compute overloaded and then scheduled away later only to bounce back again. Alternatively the shared tasks would just bounce around nodes because the fault information is effectively noise. Either way accounting for shared faults the same as private faults can result in lower performance overall. The second reason is based on a hypothetical workload that has a small number of very important, heavily accessed private pages but a large shared array. The shared array would dominate the number of faults and be selected as a preferred node even though it's the wrong decision. The third reason is that multiple threads in a process will race each other to fault the shared page making the fault information unreliable. Signed-off-by: Mel Gorman [ Fix complication error when !NUMA_BALANCING. ] Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-30-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/mm.h | 89 +++++++++++++++++++++++++++++---------- include/linux/mm_types.h | 4 +- include/linux/page-flags-layout.h | 28 +++++++----- kernel/sched/fair.c | 12 ++++-- mm/huge_memory.c | 8 ++-- mm/memory.c | 16 +++---- mm/mempolicy.c | 8 ++-- mm/migrate.c | 4 +- mm/mm_init.c | 18 ++++---- mm/mmzone.c | 14 +++--- mm/mprotect.c | 26 ++++++++---- mm/page_alloc.c | 4 +- 12 files changed, 149 insertions(+), 82 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b6e55ee8855..bb412ce2a8b5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -581,11 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) * sets it, so none of the operations on it need to be atomic. */ -/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */ +/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NIDPID] | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) -#define LAST_NID_PGOFF (ZONES_PGOFF - LAST_NID_WIDTH) +#define LAST_NIDPID_PGOFF (ZONES_PGOFF - LAST_NIDPID_WIDTH) /* * Define the bit shifts to access each section. For non-existent @@ -595,7 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -#define LAST_NID_PGSHIFT (LAST_NID_PGOFF * (LAST_NID_WIDTH != 0)) +#define LAST_NIDPID_PGSHIFT (LAST_NIDPID_PGOFF * (LAST_NIDPID_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS @@ -617,7 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define LAST_NID_MASK ((1UL << LAST_NID_WIDTH) - 1) +#define LAST_NIDPID_MASK ((1UL << LAST_NIDPID_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type page_zonenum(const struct page *page) @@ -661,48 +661,93 @@ static inline int page_to_nid(const struct page *page) #endif #ifdef CONFIG_NUMA_BALANCING -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS -static inline int page_nid_xchg_last(struct page *page, int nid) +static inline int nid_pid_to_nidpid(int nid, int pid) { - return xchg(&page->_last_nid, nid); + return ((nid & LAST__NID_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); } -static inline int page_nid_last(struct page *page) +static inline int nidpid_to_pid(int nidpid) { - return page->_last_nid; + return nidpid & LAST__PID_MASK; } -static inline void page_nid_reset_last(struct page *page) + +static inline int nidpid_to_nid(int nidpid) +{ + return (nidpid >> LAST__PID_SHIFT) & LAST__NID_MASK; +} + +static inline bool nidpid_pid_unset(int nidpid) +{ + return nidpid_to_pid(nidpid) == (-1 & LAST__PID_MASK); +} + +static inline bool nidpid_nid_unset(int nidpid) { - page->_last_nid = -1; + return nidpid_to_nid(nidpid) == (-1 & LAST__NID_MASK); +} + +#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS +static inline int page_nidpid_xchg_last(struct page *page, int nid) +{ + return xchg(&page->_last_nidpid, nid); +} + +static inline int page_nidpid_last(struct page *page) +{ + return page->_last_nidpid; +} +static inline void page_nidpid_reset_last(struct page *page) +{ + page->_last_nidpid = -1; } #else -static inline int page_nid_last(struct page *page) +static inline int page_nidpid_last(struct page *page) { - return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK; + return (page->flags >> LAST_NIDPID_PGSHIFT) & LAST_NIDPID_MASK; } -extern int page_nid_xchg_last(struct page *page, int nid); +extern int page_nidpid_xchg_last(struct page *page, int nidpid); -static inline void page_nid_reset_last(struct page *page) +static inline void page_nidpid_reset_last(struct page *page) { - int nid = (1 << LAST_NID_SHIFT) - 1; + int nidpid = (1 << LAST_NIDPID_SHIFT) - 1; - page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); - page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; + page->flags &= ~(LAST_NIDPID_MASK << LAST_NIDPID_PGSHIFT); + page->flags |= (nidpid & LAST_NIDPID_MASK) << LAST_NIDPID_PGSHIFT; } -#endif /* LAST_NID_NOT_IN_PAGE_FLAGS */ +#endif /* LAST_NIDPID_NOT_IN_PAGE_FLAGS */ #else -static inline int page_nid_xchg_last(struct page *page, int nid) +static inline int page_nidpid_xchg_last(struct page *page, int nidpid) { return page_to_nid(page); } -static inline int page_nid_last(struct page *page) +static inline int page_nidpid_last(struct page *page) { return page_to_nid(page); } -static inline void page_nid_reset_last(struct page *page) +static inline int nidpid_to_nid(int nidpid) +{ + return -1; +} + +static inline int nidpid_to_pid(int nidpid) +{ + return -1; +} + +static inline int nid_pid_to_nidpid(int nid, int pid) +{ + return -1; +} + +static inline bool nidpid_pid_unset(int nidpid) +{ + return 1; +} + +static inline void page_nidpid_reset_last(struct page *page) { } #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b7adf1d4310c..38a902a6d1e3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -174,8 +174,8 @@ struct page { void *shadow; #endif -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS - int _last_nid; +#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS + int _last_nidpid; #endif } /* diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 93506a114034..02bc9184f16b 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -38,10 +38,10 @@ * The last is when there is insufficient space in page->flags and a separate * lookup is necessary. * - * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | - * " plus space for last_nid: | NODE | ZONE | LAST_NID ... | FLAGS | - * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | - * " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NID ... | FLAGS | + * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | + * " plus space for last_nidpid: | NODE | ZONE | LAST_NIDPID ... | FLAGS | + * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | + * " plus space for last_nidpid: | SECTION | NODE | ZONE | LAST_NIDPID ... | FLAGS | * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS | */ #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) @@ -62,15 +62,21 @@ #endif #ifdef CONFIG_NUMA_BALANCING -#define LAST_NID_SHIFT NODES_SHIFT +#define LAST__PID_SHIFT 8 +#define LAST__PID_MASK ((1 << LAST__PID_SHIFT)-1) + +#define LAST__NID_SHIFT NODES_SHIFT +#define LAST__NID_MASK ((1 << LAST__NID_SHIFT)-1) + +#define LAST_NIDPID_SHIFT (LAST__PID_SHIFT+LAST__NID_SHIFT) #else -#define LAST_NID_SHIFT 0 +#define LAST_NIDPID_SHIFT 0 #endif -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS -#define LAST_NID_WIDTH LAST_NID_SHIFT +#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NIDPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#define LAST_NIDPID_WIDTH LAST_NIDPID_SHIFT #else -#define LAST_NID_WIDTH 0 +#define LAST_NIDPID_WIDTH 0 #endif /* @@ -81,8 +87,8 @@ #define NODE_NOT_IN_PAGE_FLAGS #endif -#if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0 -#define LAST_NID_NOT_IN_PAGE_FLAGS +#if defined(CONFIG_NUMA_BALANCING) && LAST_NIDPID_WIDTH == 0 +#define LAST_NIDPID_NOT_IN_PAGE_FLAGS #endif #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 862d20d02e5c..b1de7c55e9f7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -988,7 +988,7 @@ static void task_numa_placement(struct task_struct *p) /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int last_nid, int node, int pages, bool migrated) +void task_numa_fault(int last_nidpid, int node, int pages, bool migrated) { struct task_struct *p = current; int priv; @@ -1000,8 +1000,14 @@ void task_numa_fault(int last_nid, int node, int pages, bool migrated) if (!p->mm) return; - /* For now, do not attempt to detect private/shared accesses */ - priv = 1; + /* + * First accesses are treated as private, otherwise consider accesses + * to be private if the accessing pid has not changed + */ + if (!nidpid_pid_unset(last_nidpid)) + priv = ((p->pid & LAST__PID_MASK) == nidpid_to_pid(last_nidpid)); + else + priv = 1; /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults)) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2a28c2c6c165..0baf0e4d5203 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1282,7 +1282,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); - int target_nid, last_nid = -1; + int target_nid, last_nidpid = -1; bool page_locked; bool migrated = false; @@ -1293,7 +1293,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pmd_page(pmd); BUG_ON(is_huge_zero_page(page)); page_nid = page_to_nid(page); - last_nid = page_nid_last(page); + last_nidpid = page_nidpid_last(page); count_vm_numa_event(NUMA_HINT_FAULTS); if (page_nid == this_nid) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); @@ -1362,7 +1362,7 @@ out: page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(last_nid, page_nid, HPAGE_PMD_NR, migrated); + task_numa_fault(last_nidpid, page_nid, HPAGE_PMD_NR, migrated); return 0; } @@ -1682,7 +1682,7 @@ static void __split_huge_page_refcount(struct page *page, page_tail->mapping = page->mapping; page_tail->index = page->index + i; - page_nid_xchg_last(page_tail, page_nid_last(page)); + page_nidpid_xchg_last(page_tail, page_nidpid_last(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); diff --git a/mm/memory.c b/mm/memory.c index 3e3b4b8b6c41..cc7f20691c82 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -69,8 +69,8 @@ #include "internal.h" -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS -#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. +#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS +#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nidpid. #endif #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -3536,7 +3536,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page = NULL; spinlock_t *ptl; int page_nid = -1; - int last_nid; + int last_nidpid; int target_nid; bool migrated = false; @@ -3567,7 +3567,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } BUG_ON(is_zero_pfn(page_to_pfn(page))); - last_nid = page_nid_last(page); + last_nidpid = page_nidpid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(ptep, ptl); @@ -3583,7 +3583,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, out: if (page_nid != -1) - task_numa_fault(last_nid, page_nid, 1, migrated); + task_numa_fault(last_nidpid, page_nid, 1, migrated); return 0; } @@ -3598,7 +3598,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long offset; spinlock_t *ptl; bool numa = false; - int last_nid; + int last_nidpid; spin_lock(&mm->page_table_lock); pmd = *pmdp; @@ -3643,7 +3643,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!page)) continue; - last_nid = page_nid_last(page); + last_nidpid = page_nidpid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(pte, ptl); @@ -3656,7 +3656,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } if (page_nid != -1) - task_numa_fault(last_nid, page_nid, 1, migrated); + task_numa_fault(last_nidpid, page_nid, 1, migrated); pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 04729647f359..aff1f1ed3dc5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2348,9 +2348,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { - int last_nid; + int last_nidpid; + int this_nidpid; polnid = numa_node_id(); + this_nidpid = nid_pid_to_nidpid(polnid, current->pid); /* * Multi-stage node selection is used in conjunction @@ -2373,8 +2375,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * it less likely we act on an unlikely task<->page * relation. */ - last_nid = page_nid_xchg_last(page, polnid); - if (last_nid != polnid) + last_nidpid = page_nidpid_xchg_last(page, this_nidpid); + if (!nidpid_pid_unset(last_nidpid) && nidpid_to_nid(last_nidpid) != polnid) goto out; } diff --git a/mm/migrate.c b/mm/migrate.c index fcba2f46bb80..025d1e3d2ad2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1498,7 +1498,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, __GFP_NOWARN) & ~GFP_IOFS, 0); if (newpage) - page_nid_xchg_last(newpage, page_nid_last(page)); + page_nidpid_xchg_last(newpage, page_nidpid_last(page)); return newpage; } @@ -1675,7 +1675,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, if (!new_page) goto out_fail; - page_nid_xchg_last(new_page, page_nid_last(page)); + page_nidpid_xchg_last(new_page, page_nidpid_last(page)); isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) { diff --git a/mm/mm_init.c b/mm/mm_init.c index 633c08863fd8..467de579784b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -71,26 +71,26 @@ void __init mminit_verify_pageflags_layout(void) unsigned long or_mask, add_mask; shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT; + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NIDPID_SHIFT; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", - "Section %d Node %d Zone %d Lastnid %d Flags %d\n", + "Section %d Node %d Zone %d Lastnidpid %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, - LAST_NID_WIDTH, + LAST_NIDPID_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d Lastnid %d\n", + "Section %d Node %d Zone %d Lastnidpid %d\n", SECTIONS_SHIFT, NODES_SHIFT, ZONES_SHIFT, - LAST_NID_SHIFT); + LAST_NIDPID_SHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", - "Section %lu Node %lu Zone %lu Lastnid %lu\n", + "Section %lu Node %lu Zone %lu Lastnidpid %lu\n", (unsigned long)SECTIONS_PGSHIFT, (unsigned long)NODES_PGSHIFT, (unsigned long)ZONES_PGSHIFT, - (unsigned long)LAST_NID_PGSHIFT); + (unsigned long)LAST_NIDPID_PGSHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", "Node/Zone ID: %lu -> %lu\n", (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), @@ -102,9 +102,9 @@ void __init mminit_verify_pageflags_layout(void) mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", "Node not in page flags"); #endif -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS +#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", - "Last nid not in page flags"); + "Last nidpid not in page flags"); #endif if (SECTIONS_WIDTH) { diff --git a/mm/mmzone.c b/mm/mmzone.c index 2ac0afbd68f3..25bb477deb26 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec) INIT_LIST_HEAD(&lruvec->lists[lru]); } -#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS) -int page_nid_xchg_last(struct page *page, int nid) +#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NIDPID_NOT_IN_PAGE_FLAGS) +int page_nidpid_xchg_last(struct page *page, int nidpid) { unsigned long old_flags, flags; - int last_nid; + int last_nidpid; do { old_flags = flags = page->flags; - last_nid = page_nid_last(page); + last_nidpid = page_nidpid_last(page); - flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); - flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; + flags &= ~(LAST_NIDPID_MASK << LAST_NIDPID_PGSHIFT); + flags |= (nidpid & LAST_NIDPID_MASK) << LAST_NIDPID_PGSHIFT; } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); - return last_nid; + return last_nidpid; } #endif diff --git a/mm/mprotect.c b/mm/mprotect.c index 41e02923fcd9..f0b087d1069c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -37,14 +37,15 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa, bool *ret_all_same_node) + int dirty_accountable, int prot_numa, bool *ret_all_same_nidpid) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; - bool all_same_node = true; + bool all_same_nidpid = true; int last_nid = -1; + int last_pid = -1; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -63,11 +64,18 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page = vm_normal_page(vma, addr, oldpte); if (page) { - int this_nid = page_to_nid(page); + int nidpid = page_nidpid_last(page); + int this_nid = nidpid_to_nid(nidpid); + int this_pid = nidpid_to_pid(nidpid); + if (last_nid == -1) last_nid = this_nid; - if (last_nid != this_nid) - all_same_node = false; + if (last_pid == -1) + last_pid = this_pid; + if (last_nid != this_nid || + last_pid != this_pid) { + all_same_nidpid = false; + } if (!pte_numa(oldpte)) { ptent = pte_mknuma(ptent); @@ -107,7 +115,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); - *ret_all_same_node = all_same_node; + *ret_all_same_nidpid = all_same_nidpid; return pages; } @@ -134,7 +142,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pmd_t *pmd; unsigned long next; unsigned long pages = 0; - bool all_same_node; + bool all_same_nidpid; pmd = pmd_offset(pud, addr); do { @@ -158,7 +166,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, if (pmd_none_or_clear_bad(pmd)) continue; pages += change_pte_range(vma, pmd, addr, next, newprot, - dirty_accountable, prot_numa, &all_same_node); + dirty_accountable, prot_numa, &all_same_nidpid); /* * If we are changing protections for NUMA hinting faults then @@ -166,7 +174,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, * node. This allows a regular PMD to be handled as one fault * and effectively batches the taking of the PTL */ - if (prot_numa && all_same_node) + if (prot_numa && all_same_nidpid) change_pmd_protnuma(vma->vm_mm, addr, pmd); } while (pmd++, addr = next, addr != end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dd886fac451a..89bedd0e4cad 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -626,7 +626,7 @@ static inline int free_pages_check(struct page *page) bad_page(page); return 1; } - page_nid_reset_last(page); + page_nidpid_reset_last(page); if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -4015,7 +4015,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); - page_nid_reset_last(page); + page_nidpid_reset_last(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for -- cgit v1.2.3 From 6fe6b2d6dabf392aceb3ad3a5e859b46a04465c6 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:08 +0100 Subject: sched/numa: Do not migrate memory immediately after switching node The load balancer can move tasks between nodes and does not take NUMA locality into account. With automatic NUMA balancing this may result in the tasks working set being migrated to the new node. However, as the fault buffer will still store faults from the old node the schduler may decide to reset the preferred node and migrate the task back resulting in more migrations. The ideal would be that the scheduler did not migrate tasks with a heavy memory footprint but this may result nodes being overloaded. We could also discard the fault information on task migration but this would still cause all the tasks working set to be migrated. This patch simply avoids migrating the memory for a short time after a task is migrated. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-31-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 18 ++++++++++++++++-- mm/mempolicy.c | 12 ++++++++++++ 3 files changed, 29 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 66b878e94554..9060a7f4e9ed 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1631,7 +1631,7 @@ static void __sched_fork(struct task_struct *p) p->node_stamp = 0ULL; p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; - p->numa_migrate_seq = 0; + p->numa_migrate_seq = 1; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_preferred_nid = -1; p->numa_work.next = &p->numa_work; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b1de7c55e9f7..61ec0d4765b9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -884,7 +884,7 @@ static unsigned int task_scan_max(struct task_struct *p) * the preferred node but still allow the scheduler to move the task again if * the nodes CPUs are overloaded. */ -unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3; +unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; static inline int task_faults_idx(int nid, int priv) { @@ -980,7 +980,7 @@ static void task_numa_placement(struct task_struct *p) /* Update the preferred nid and migrate task if possible */ p->numa_preferred_nid = max_nid; - p->numa_migrate_seq = 0; + p->numa_migrate_seq = 1; migrate_task_to(p, preferred_cpu); } } @@ -4121,6 +4121,20 @@ static void move_task(struct task_struct *p, struct lb_env *env) set_task_cpu(p, env->dst_cpu); activate_task(env->dst_rq, p, 0); check_preempt_curr(env->dst_rq, p, 0); +#ifdef CONFIG_NUMA_BALANCING + if (p->numa_preferred_nid != -1) { + int src_nid = cpu_to_node(env->src_cpu); + int dst_nid = cpu_to_node(env->dst_cpu); + + /* + * If the load balancer has moved the task then limit + * migrations from taking place in the short term in + * case this is a short-lived migration. + */ + if (src_nid != dst_nid && dst_nid != p->numa_preferred_nid) + p->numa_migrate_seq = 0; + } +#endif } /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index aff1f1ed3dc5..196d8da2b657 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2378,6 +2378,18 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long last_nidpid = page_nidpid_xchg_last(page, this_nidpid); if (!nidpid_pid_unset(last_nidpid) && nidpid_to_nid(last_nidpid) != polnid) goto out; + +#ifdef CONFIG_NUMA_BALANCING + /* + * If the scheduler has just moved us away from our + * preferred node, do not bother migrating pages yet. + * This way a short and temporary process migration will + * not cause excessive memory migration. + */ + if (polnid != current->numa_preferred_nid && + !current->numa_migrate_seq) + goto out; +#endif } if (curnid != polnid) -- cgit v1.2.3 From fc3147245d193bd0f57307859c698fa28a20b0fe Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:09 +0100 Subject: mm: numa: Limit NUMA scanning to migrate-on-fault VMAs There is a 90% regression observed with a large Oracle performance test on a 4 node system. Profiles indicated that the overhead was due to contention on sp_lock when looking up shared memory policies. These policies do not have the appropriate flags to allow them to be automatically balanced so trapping faults on them is pointless. This patch skips VMAs that do not have MPOL_F_MOF set. [riel@redhat.com: Initial patch] Signed-off-by: Mel Gorman Reported-and-tested-by: Joe Mario Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-32-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/mempolicy.h | 1 + kernel/sched/fair.c | 2 +- mm/mempolicy.c | 24 ++++++++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index da6716b9e3fe..ea4d2495c646 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -136,6 +136,7 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_vma_policy(struct task_struct *tsk, struct vm_area_struct *vma, unsigned long addr); +bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 61ec0d4765b9..d98175d5c2c6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1130,7 +1130,7 @@ void task_numa_work(struct callback_head *work) vma = mm->mmap; } for (; vma; vma = vma->vm_next) { - if (!vma_migratable(vma)) + if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) continue; do { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 196d8da2b657..0e895a2eed5f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1679,6 +1679,30 @@ struct mempolicy *get_vma_policy(struct task_struct *task, return pol; } +bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) +{ + struct mempolicy *pol = get_task_policy(task); + if (vma) { + if (vma->vm_ops && vma->vm_ops->get_policy) { + bool ret = false; + + pol = vma->vm_ops->get_policy(vma, vma->vm_start); + if (pol && (pol->flags & MPOL_F_MOF)) + ret = true; + mpol_cond_put(pol); + + return ret; + } else if (vma->vm_policy) { + pol = vma->vm_policy; + } + } + + if (!pol) + return default_policy.flags & MPOL_F_MOF; + + return pol->flags & MPOL_F_MOF; +} + static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; -- cgit v1.2.3 From 25cbbef1924299249756bc4030fcb2436c019813 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:14 +0100 Subject: mm: numa: Trap pmd hinting faults only if we would otherwise trap PTE faults Base page PMD faulting is meant to batch handle NUMA hinting faults from PTEs. However, even is no PTE faults would ever be handled within a range the kernel still traps PMD hinting faults. This patch avoids the overhead. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-37-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/mprotect.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index f0b087d1069c..5aae39017d6d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -146,6 +146,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pmd = pmd_offset(pud, addr); do { + unsigned long this_pages; + next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) @@ -165,8 +167,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, } if (pmd_none_or_clear_bad(pmd)) continue; - pages += change_pte_range(vma, pmd, addr, next, newprot, + this_pages = change_pte_range(vma, pmd, addr, next, newprot, dirty_accountable, prot_numa, &all_same_nidpid); + pages += this_pages; /* * If we are changing protections for NUMA hinting faults then @@ -174,7 +177,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, * node. This allows a regular PMD to be handled as one fault * and effectively batches the taking of the PTL */ - if (prot_numa && all_same_nidpid) + if (prot_numa && this_pages && all_same_nidpid) change_pmd_protnuma(vma->vm_mm, addr, pmd); } while (pmd++, addr = next, addr != end); -- cgit v1.2.3 From 90572890d202527c366aa9489b32404e88a7c020 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:29:20 +0100 Subject: mm: numa: Change page last {nid,pid} into {cpu,pid} Change the per page last fault tracking to use cpu,pid instead of nid,pid. This will allow us to try and lookup the alternate task more easily. Note that even though it is the cpu that is store in the page flags that the mpol_misplaced decision is still based on the node. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1381141781-10992-43-git-send-email-mgorman@suse.de [ Fixed build failure on 32-bit systems. ] Signed-off-by: Ingo Molnar --- include/linux/mm.h | 90 ++++++++++++++++++++++----------------- include/linux/mm_types.h | 4 +- include/linux/page-flags-layout.h | 22 +++++----- kernel/bounds.c | 4 ++ kernel/sched/fair.c | 6 +-- mm/huge_memory.c | 8 ++-- mm/memory.c | 16 +++---- mm/mempolicy.c | 16 ++++--- mm/migrate.c | 4 +- mm/mm_init.c | 18 ++++---- mm/mmzone.c | 14 +++--- mm/mprotect.c | 28 ++++++------ mm/page_alloc.c | 4 +- 13 files changed, 125 insertions(+), 109 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index bb412ce2a8b5..ce464cd4777e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -581,11 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) * sets it, so none of the operations on it need to be atomic. */ -/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NIDPID] | ... | FLAGS | */ +/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) -#define LAST_NIDPID_PGOFF (ZONES_PGOFF - LAST_NIDPID_WIDTH) +#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) /* * Define the bit shifts to access each section. For non-existent @@ -595,7 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -#define LAST_NIDPID_PGSHIFT (LAST_NIDPID_PGOFF * (LAST_NIDPID_WIDTH != 0)) +#define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS @@ -617,7 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define LAST_NIDPID_MASK ((1UL << LAST_NIDPID_WIDTH) - 1) +#define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type page_zonenum(const struct page *page) @@ -661,96 +661,106 @@ static inline int page_to_nid(const struct page *page) #endif #ifdef CONFIG_NUMA_BALANCING -static inline int nid_pid_to_nidpid(int nid, int pid) +static inline int cpu_pid_to_cpupid(int cpu, int pid) { - return ((nid & LAST__NID_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); + return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); } -static inline int nidpid_to_pid(int nidpid) +static inline int cpupid_to_pid(int cpupid) { - return nidpid & LAST__PID_MASK; + return cpupid & LAST__PID_MASK; } -static inline int nidpid_to_nid(int nidpid) +static inline int cpupid_to_cpu(int cpupid) { - return (nidpid >> LAST__PID_SHIFT) & LAST__NID_MASK; + return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK; } -static inline bool nidpid_pid_unset(int nidpid) +static inline int cpupid_to_nid(int cpupid) { - return nidpid_to_pid(nidpid) == (-1 & LAST__PID_MASK); + return cpu_to_node(cpupid_to_cpu(cpupid)); } -static inline bool nidpid_nid_unset(int nidpid) +static inline bool cpupid_pid_unset(int cpupid) { - return nidpid_to_nid(nidpid) == (-1 & LAST__NID_MASK); + return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK); } -#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS -static inline int page_nidpid_xchg_last(struct page *page, int nid) +static inline bool cpupid_cpu_unset(int cpupid) { - return xchg(&page->_last_nidpid, nid); + return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); } -static inline int page_nidpid_last(struct page *page) +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS +static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { - return page->_last_nidpid; + return xchg(&page->_last_cpupid, cpupid); } -static inline void page_nidpid_reset_last(struct page *page) + +static inline int page_cpupid_last(struct page *page) +{ + return page->_last_cpupid; +} +static inline void page_cpupid_reset_last(struct page *page) { - page->_last_nidpid = -1; + page->_last_cpupid = -1; } #else -static inline int page_nidpid_last(struct page *page) +static inline int page_cpupid_last(struct page *page) { - return (page->flags >> LAST_NIDPID_PGSHIFT) & LAST_NIDPID_MASK; + return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } -extern int page_nidpid_xchg_last(struct page *page, int nidpid); +extern int page_cpupid_xchg_last(struct page *page, int cpupid); -static inline void page_nidpid_reset_last(struct page *page) +static inline void page_cpupid_reset_last(struct page *page) { - int nidpid = (1 << LAST_NIDPID_SHIFT) - 1; + int cpupid = (1 << LAST_CPUPID_SHIFT) - 1; - page->flags &= ~(LAST_NIDPID_MASK << LAST_NIDPID_PGSHIFT); - page->flags |= (nidpid & LAST_NIDPID_MASK) << LAST_NIDPID_PGSHIFT; + page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); + page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; } -#endif /* LAST_NIDPID_NOT_IN_PAGE_FLAGS */ -#else -static inline int page_nidpid_xchg_last(struct page *page, int nidpid) +#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ +#else /* !CONFIG_NUMA_BALANCING */ +static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { - return page_to_nid(page); + return page_to_nid(page); /* XXX */ } -static inline int page_nidpid_last(struct page *page) +static inline int page_cpupid_last(struct page *page) { - return page_to_nid(page); + return page_to_nid(page); /* XXX */ } -static inline int nidpid_to_nid(int nidpid) +static inline int cpupid_to_nid(int cpupid) { return -1; } -static inline int nidpid_to_pid(int nidpid) +static inline int cpupid_to_pid(int cpupid) { return -1; } -static inline int nid_pid_to_nidpid(int nid, int pid) +static inline int cpupid_to_cpu(int cpupid) { return -1; } -static inline bool nidpid_pid_unset(int nidpid) +static inline int cpu_pid_to_cpupid(int nid, int pid) +{ + return -1; +} + +static inline bool cpupid_pid_unset(int cpupid) { return 1; } -static inline void page_nidpid_reset_last(struct page *page) +static inline void page_cpupid_reset_last(struct page *page) { } -#endif +#endif /* CONFIG_NUMA_BALANCING */ static inline struct zone *page_zone(const struct page *page) { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 38a902a6d1e3..a30f9ca66557 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -174,8 +174,8 @@ struct page { void *shadow; #endif -#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS - int _last_nidpid; +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS + int _last_cpupid; #endif } /* diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 02bc9184f16b..da523661500a 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -39,9 +39,9 @@ * lookup is necessary. * * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | - * " plus space for last_nidpid: | NODE | ZONE | LAST_NIDPID ... | FLAGS | + * " plus space for last_cpupid: | NODE | ZONE | LAST_CPUPID ... | FLAGS | * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | - * " plus space for last_nidpid: | SECTION | NODE | ZONE | LAST_NIDPID ... | FLAGS | + * " plus space for last_cpupid: | SECTION | NODE | ZONE | LAST_CPUPID ... | FLAGS | * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS | */ #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) @@ -65,18 +65,18 @@ #define LAST__PID_SHIFT 8 #define LAST__PID_MASK ((1 << LAST__PID_SHIFT)-1) -#define LAST__NID_SHIFT NODES_SHIFT -#define LAST__NID_MASK ((1 << LAST__NID_SHIFT)-1) +#define LAST__CPU_SHIFT NR_CPUS_BITS +#define LAST__CPU_MASK ((1 << LAST__CPU_SHIFT)-1) -#define LAST_NIDPID_SHIFT (LAST__PID_SHIFT+LAST__NID_SHIFT) +#define LAST_CPUPID_SHIFT (LAST__PID_SHIFT+LAST__CPU_SHIFT) #else -#define LAST_NIDPID_SHIFT 0 +#define LAST_CPUPID_SHIFT 0 #endif -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NIDPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS -#define LAST_NIDPID_WIDTH LAST_NIDPID_SHIFT +#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT #else -#define LAST_NIDPID_WIDTH 0 +#define LAST_CPUPID_WIDTH 0 #endif /* @@ -87,8 +87,8 @@ #define NODE_NOT_IN_PAGE_FLAGS #endif -#if defined(CONFIG_NUMA_BALANCING) && LAST_NIDPID_WIDTH == 0 -#define LAST_NIDPID_NOT_IN_PAGE_FLAGS +#if defined(CONFIG_NUMA_BALANCING) && LAST_CPUPID_WIDTH == 0 +#define LAST_CPUPID_NOT_IN_PAGE_FLAGS #endif #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ diff --git a/kernel/bounds.c b/kernel/bounds.c index 0c9b862292b2..e8ca97b5c386 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -10,6 +10,7 @@ #include #include #include +#include void foo(void) { @@ -17,5 +18,8 @@ void foo(void) DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); +#ifdef CONFIG_SMP + DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); +#endif /* End of constants */ } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aa561c8dc899..dbe0f628efa3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1210,7 +1210,7 @@ static void task_numa_placement(struct task_struct *p) /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int last_nidpid, int node, int pages, bool migrated) +void task_numa_fault(int last_cpupid, int node, int pages, bool migrated) { struct task_struct *p = current; int priv; @@ -1226,8 +1226,8 @@ void task_numa_fault(int last_nidpid, int node, int pages, bool migrated) * First accesses are treated as private, otherwise consider accesses * to be private if the accessing pid has not changed */ - if (!nidpid_pid_unset(last_nidpid)) - priv = ((p->pid & LAST__PID_MASK) == nidpid_to_pid(last_nidpid)); + if (!cpupid_pid_unset(last_cpupid)) + priv = ((p->pid & LAST__PID_MASK) == cpupid_to_pid(last_cpupid)); else priv = 1; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0baf0e4d5203..becf92ca54f3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1282,7 +1282,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); - int target_nid, last_nidpid = -1; + int target_nid, last_cpupid = -1; bool page_locked; bool migrated = false; @@ -1293,7 +1293,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pmd_page(pmd); BUG_ON(is_huge_zero_page(page)); page_nid = page_to_nid(page); - last_nidpid = page_nidpid_last(page); + last_cpupid = page_cpupid_last(page); count_vm_numa_event(NUMA_HINT_FAULTS); if (page_nid == this_nid) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); @@ -1362,7 +1362,7 @@ out: page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(last_nidpid, page_nid, HPAGE_PMD_NR, migrated); + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, migrated); return 0; } @@ -1682,7 +1682,7 @@ static void __split_huge_page_refcount(struct page *page, page_tail->mapping = page->mapping; page_tail->index = page->index + i; - page_nidpid_xchg_last(page_tail, page_nidpid_last(page)); + page_cpupid_xchg_last(page_tail, page_cpupid_last(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); diff --git a/mm/memory.c b/mm/memory.c index cc7f20691c82..5162e6d0d652 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -69,8 +69,8 @@ #include "internal.h" -#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS -#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nidpid. +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS +#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -3536,7 +3536,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page = NULL; spinlock_t *ptl; int page_nid = -1; - int last_nidpid; + int last_cpupid; int target_nid; bool migrated = false; @@ -3567,7 +3567,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } BUG_ON(is_zero_pfn(page_to_pfn(page))); - last_nidpid = page_nidpid_last(page); + last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(ptep, ptl); @@ -3583,7 +3583,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, out: if (page_nid != -1) - task_numa_fault(last_nidpid, page_nid, 1, migrated); + task_numa_fault(last_cpupid, page_nid, 1, migrated); return 0; } @@ -3598,7 +3598,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long offset; spinlock_t *ptl; bool numa = false; - int last_nidpid; + int last_cpupid; spin_lock(&mm->page_table_lock); pmd = *pmdp; @@ -3643,7 +3643,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!page)) continue; - last_nidpid = page_nidpid_last(page); + last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(pte, ptl); @@ -3656,7 +3656,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } if (page_nid != -1) - task_numa_fault(last_nidpid, page_nid, 1, migrated); + task_numa_fault(last_cpupid, page_nid, 1, migrated); pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0e895a2eed5f..a5867ef24bda 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2324,6 +2324,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long struct zone *zone; int curnid = page_to_nid(page); unsigned long pgoff; + int thiscpu = raw_smp_processor_id(); + int thisnid = cpu_to_node(thiscpu); int polnid = -1; int ret = -1; @@ -2372,11 +2374,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { - int last_nidpid; - int this_nidpid; + int last_cpupid; + int this_cpupid; - polnid = numa_node_id(); - this_nidpid = nid_pid_to_nidpid(polnid, current->pid); + polnid = thisnid; + this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid); /* * Multi-stage node selection is used in conjunction @@ -2399,8 +2401,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * it less likely we act on an unlikely task<->page * relation. */ - last_nidpid = page_nidpid_xchg_last(page, this_nidpid); - if (!nidpid_pid_unset(last_nidpid) && nidpid_to_nid(last_nidpid) != polnid) + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) goto out; #ifdef CONFIG_NUMA_BALANCING @@ -2410,7 +2412,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * This way a short and temporary process migration will * not cause excessive memory migration. */ - if (polnid != current->numa_preferred_nid && + if (thisnid != current->numa_preferred_nid && !current->numa_migrate_seq) goto out; #endif diff --git a/mm/migrate.c b/mm/migrate.c index 025d1e3d2ad2..ff537749d3b4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1498,7 +1498,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, __GFP_NOWARN) & ~GFP_IOFS, 0); if (newpage) - page_nidpid_xchg_last(newpage, page_nidpid_last(page)); + page_cpupid_xchg_last(newpage, page_cpupid_last(page)); return newpage; } @@ -1675,7 +1675,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, if (!new_page) goto out_fail; - page_nidpid_xchg_last(new_page, page_nidpid_last(page)); + page_cpupid_xchg_last(new_page, page_cpupid_last(page)); isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) { diff --git a/mm/mm_init.c b/mm/mm_init.c index 467de579784b..68562e92d50c 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -71,26 +71,26 @@ void __init mminit_verify_pageflags_layout(void) unsigned long or_mask, add_mask; shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NIDPID_SHIFT; + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", - "Section %d Node %d Zone %d Lastnidpid %d Flags %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, - LAST_NIDPID_WIDTH, + LAST_CPUPID_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d Lastnidpid %d\n", + "Section %d Node %d Zone %d Lastcpupid %d\n", SECTIONS_SHIFT, NODES_SHIFT, ZONES_SHIFT, - LAST_NIDPID_SHIFT); + LAST_CPUPID_SHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", - "Section %lu Node %lu Zone %lu Lastnidpid %lu\n", + "Section %lu Node %lu Zone %lu Lastcpupid %lu\n", (unsigned long)SECTIONS_PGSHIFT, (unsigned long)NODES_PGSHIFT, (unsigned long)ZONES_PGSHIFT, - (unsigned long)LAST_NIDPID_PGSHIFT); + (unsigned long)LAST_CPUPID_PGSHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", "Node/Zone ID: %lu -> %lu\n", (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), @@ -102,9 +102,9 @@ void __init mminit_verify_pageflags_layout(void) mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", "Node not in page flags"); #endif -#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", - "Last nidpid not in page flags"); + "Last cpupid not in page flags"); #endif if (SECTIONS_WIDTH) { diff --git a/mm/mmzone.c b/mm/mmzone.c index 25bb477deb26..bf34fb8556db 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec) INIT_LIST_HEAD(&lruvec->lists[lru]); } -#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NIDPID_NOT_IN_PAGE_FLAGS) -int page_nidpid_xchg_last(struct page *page, int nidpid) +#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) +int page_cpupid_xchg_last(struct page *page, int cpupid) { unsigned long old_flags, flags; - int last_nidpid; + int last_cpupid; do { old_flags = flags = page->flags; - last_nidpid = page_nidpid_last(page); + last_cpupid = page_cpupid_last(page); - flags &= ~(LAST_NIDPID_MASK << LAST_NIDPID_PGSHIFT); - flags |= (nidpid & LAST_NIDPID_MASK) << LAST_NIDPID_PGSHIFT; + flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); + flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); - return last_nidpid; + return last_cpupid; } #endif diff --git a/mm/mprotect.c b/mm/mprotect.c index 5aae39017d6d..9a74855f1241 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -37,14 +37,14 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa, bool *ret_all_same_nidpid) + int dirty_accountable, int prot_numa, bool *ret_all_same_cpupid) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; - bool all_same_nidpid = true; - int last_nid = -1; + bool all_same_cpupid = true; + int last_cpu = -1; int last_pid = -1; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); @@ -64,17 +64,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page = vm_normal_page(vma, addr, oldpte); if (page) { - int nidpid = page_nidpid_last(page); - int this_nid = nidpid_to_nid(nidpid); - int this_pid = nidpid_to_pid(nidpid); + int cpupid = page_cpupid_last(page); + int this_cpu = cpupid_to_cpu(cpupid); + int this_pid = cpupid_to_pid(cpupid); - if (last_nid == -1) - last_nid = this_nid; + if (last_cpu == -1) + last_cpu = this_cpu; if (last_pid == -1) last_pid = this_pid; - if (last_nid != this_nid || + if (last_cpu != this_cpu || last_pid != this_pid) { - all_same_nidpid = false; + all_same_cpupid = false; } if (!pte_numa(oldpte)) { @@ -115,7 +115,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); - *ret_all_same_nidpid = all_same_nidpid; + *ret_all_same_cpupid = all_same_cpupid; return pages; } @@ -142,7 +142,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pmd_t *pmd; unsigned long next; unsigned long pages = 0; - bool all_same_nidpid; + bool all_same_cpupid; pmd = pmd_offset(pud, addr); do { @@ -168,7 +168,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, if (pmd_none_or_clear_bad(pmd)) continue; this_pages = change_pte_range(vma, pmd, addr, next, newprot, - dirty_accountable, prot_numa, &all_same_nidpid); + dirty_accountable, prot_numa, &all_same_cpupid); pages += this_pages; /* @@ -177,7 +177,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, * node. This allows a regular PMD to be handled as one fault * and effectively batches the taking of the PTL */ - if (prot_numa && this_pages && all_same_nidpid) + if (prot_numa && this_pages && all_same_cpupid) change_pmd_protnuma(vma->vm_mm, addr, pmd); } while (pmd++, addr = next, addr != end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 89bedd0e4cad..73d812f16dde 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -626,7 +626,7 @@ static inline int free_pages_check(struct page *page) bad_page(page); return 1; } - page_nidpid_reset_last(page); + page_cpupid_reset_last(page); if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -4015,7 +4015,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); - page_nidpid_reset_last(page); + page_cpupid_reset_last(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for -- cgit v1.2.3 From 8c8a743c5087bac9caac8155b8f3b367e75cdd0b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:29:21 +0100 Subject: sched/numa: Use {cpu, pid} to create task groups for shared faults While parallel applications tend to align their data on the cache boundary, they tend not to align on the page or THP boundary. Consequently tasks that partition their data can still "false-share" pages presenting a problem for optimal NUMA placement. This patch uses NUMA hinting faults to chain tasks together into numa_groups. As well as storing the NID a task was running on when accessing a page a truncated representation of the faulting PID is stored. If subsequent faults are from different PIDs it is reasonable to assume that those two tasks share a page and are candidates for being grouped together. Note that this patch makes no scheduling decisions based on the grouping information. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1381141781-10992-44-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/mm.h | 11 ++++ include/linux/sched.h | 3 + kernel/sched/core.c | 3 + kernel/sched/fair.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++--- kernel/sched/sched.h | 5 +- mm/memory.c | 8 +++ 6 files changed, 182 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index ce464cd4777e..81443d557a2e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -691,6 +691,12 @@ static inline bool cpupid_cpu_unset(int cpupid) return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); } +static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) +{ + return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid); +} + +#define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { @@ -760,6 +766,11 @@ static inline bool cpupid_pid_unset(int cpupid) static inline void page_cpupid_reset_last(struct page *page) { } + +static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) +{ + return false; +} #endif /* CONFIG_NUMA_BALANCING */ static inline struct zone *page_zone(const struct page *page) diff --git a/include/linux/sched.h b/include/linux/sched.h index b6619792bb13..f587ded5c148 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1347,6 +1347,9 @@ struct task_struct { u64 node_stamp; /* migration stamp */ struct callback_head numa_work; + struct list_head numa_entry; + struct numa_group *numa_group; + /* * Exponential decaying average of faults on a per-node basis. * Scheduling placement decisions are made based on the these counts. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1fe59da280e3..51092d5cc64c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1733,6 +1733,9 @@ static void __sched_fork(struct task_struct *p) p->numa_work.next = &p->numa_work; p->numa_faults = NULL; p->numa_faults_buffer = NULL; + + INIT_LIST_HEAD(&p->numa_entry); + p->numa_group = NULL; #endif /* CONFIG_NUMA_BALANCING */ } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dbe0f628efa3..85565053a6ed 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -888,6 +888,17 @@ static unsigned int task_scan_max(struct task_struct *p) */ unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; +struct numa_group { + atomic_t refcount; + + spinlock_t lock; /* nr_tasks, tasks */ + int nr_tasks; + struct list_head task_list; + + struct rcu_head rcu; + atomic_long_t faults[0]; +}; + static inline int task_faults_idx(int nid, int priv) { return 2 * nid + priv; @@ -1182,7 +1193,10 @@ static void task_numa_placement(struct task_struct *p) int priv, i; for (priv = 0; priv < 2; priv++) { + long diff; + i = task_faults_idx(nid, priv); + diff = -p->numa_faults[i]; /* Decay existing window, copy faults since last scan */ p->numa_faults[i] >>= 1; @@ -1190,6 +1204,11 @@ static void task_numa_placement(struct task_struct *p) p->numa_faults_buffer[i] = 0; faults += p->numa_faults[i]; + diff += p->numa_faults[i]; + if (p->numa_group) { + /* safe because we can only change our own group */ + atomic_long_add(diff, &p->numa_group->faults[i]); + } } if (faults > max_faults) { @@ -1207,6 +1226,131 @@ static void task_numa_placement(struct task_struct *p) } } +static inline int get_numa_group(struct numa_group *grp) +{ + return atomic_inc_not_zero(&grp->refcount); +} + +static inline void put_numa_group(struct numa_group *grp) +{ + if (atomic_dec_and_test(&grp->refcount)) + kfree_rcu(grp, rcu); +} + +static void double_lock(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static void task_numa_group(struct task_struct *p, int cpupid) +{ + struct numa_group *grp, *my_grp; + struct task_struct *tsk; + bool join = false; + int cpu = cpupid_to_cpu(cpupid); + int i; + + if (unlikely(!p->numa_group)) { + unsigned int size = sizeof(struct numa_group) + + 2*nr_node_ids*sizeof(atomic_long_t); + + grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!grp) + return; + + atomic_set(&grp->refcount, 1); + spin_lock_init(&grp->lock); + INIT_LIST_HEAD(&grp->task_list); + + for (i = 0; i < 2*nr_node_ids; i++) + atomic_long_set(&grp->faults[i], p->numa_faults[i]); + + list_add(&p->numa_entry, &grp->task_list); + grp->nr_tasks++; + rcu_assign_pointer(p->numa_group, grp); + } + + rcu_read_lock(); + tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); + + if (!cpupid_match_pid(tsk, cpupid)) + goto unlock; + + grp = rcu_dereference(tsk->numa_group); + if (!grp) + goto unlock; + + my_grp = p->numa_group; + if (grp == my_grp) + goto unlock; + + /* + * Only join the other group if its bigger; if we're the bigger group, + * the other task will join us. + */ + if (my_grp->nr_tasks > grp->nr_tasks) + goto unlock; + + /* + * Tie-break on the grp address. + */ + if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) + goto unlock; + + if (!get_numa_group(grp)) + goto unlock; + + join = true; + +unlock: + rcu_read_unlock(); + + if (!join) + return; + + for (i = 0; i < 2*nr_node_ids; i++) { + atomic_long_sub(p->numa_faults[i], &my_grp->faults[i]); + atomic_long_add(p->numa_faults[i], &grp->faults[i]); + } + + double_lock(&my_grp->lock, &grp->lock); + + list_move(&p->numa_entry, &grp->task_list); + my_grp->nr_tasks--; + grp->nr_tasks++; + + spin_unlock(&my_grp->lock); + spin_unlock(&grp->lock); + + rcu_assign_pointer(p->numa_group, grp); + + put_numa_group(my_grp); +} + +void task_numa_free(struct task_struct *p) +{ + struct numa_group *grp = p->numa_group; + int i; + + if (grp) { + for (i = 0; i < 2*nr_node_ids; i++) + atomic_long_sub(p->numa_faults[i], &grp->faults[i]); + + spin_lock(&grp->lock); + list_del(&p->numa_entry); + grp->nr_tasks--; + spin_unlock(&grp->lock); + rcu_assign_pointer(p->numa_group, NULL); + put_numa_group(grp); + } + + kfree(p->numa_faults); +} + /* * Got a PROT_NONE fault for a page on @node. */ @@ -1222,15 +1366,6 @@ void task_numa_fault(int last_cpupid, int node, int pages, bool migrated) if (!p->mm) return; - /* - * First accesses are treated as private, otherwise consider accesses - * to be private if the accessing pid has not changed - */ - if (!cpupid_pid_unset(last_cpupid)) - priv = ((p->pid & LAST__PID_MASK) == cpupid_to_pid(last_cpupid)); - else - priv = 1; - /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults)) { int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; @@ -1244,6 +1379,18 @@ void task_numa_fault(int last_cpupid, int node, int pages, bool migrated) p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); } + /* + * First accesses are treated as private, otherwise consider accesses + * to be private if the accessing pid has not changed + */ + if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { + priv = 1; + } else { + priv = cpupid_match_pid(p, last_cpupid); + if (!priv) + task_numa_group(p, last_cpupid); + } + /* * If pages are properly placed (did not migrate) then scan slower. * This is reset periodically in case of phase changes diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 691e96964dcc..8037b10a256f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -559,10 +559,7 @@ static inline u64 rq_clock_task(struct rq *rq) #ifdef CONFIG_NUMA_BALANCING extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *, struct task_struct *); -static inline void task_numa_free(struct task_struct *p) -{ - kfree(p->numa_faults); -} +extern void task_numa_free(struct task_struct *p); #else /* CONFIG_NUMA_BALANCING */ static inline void task_numa_free(struct task_struct *p) { diff --git a/mm/memory.c b/mm/memory.c index 5162e6d0d652..c57efa25cdbb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2719,6 +2719,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, get_page(dirty_page); reuse: + /* + * Clear the pages cpupid information as the existing + * information potentially belongs to a now completely + * unrelated process. + */ + if (old_page) + page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1); + flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); -- cgit v1.2.3 From 7851a45cd3f6198bf542c30e27b330e8eeb3736c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:23 +0100 Subject: mm: numa: Copy cpupid on page migration After page migration, the new page has the nidpid unset. This makes every fault on a recently migrated page look like a first numa fault, leading to another page migration. Copying over the nidpid at page migration time should prevent erroneous migrations of recently migrated pages. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-46-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/migrate.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index ff537749d3b4..44c1fa9d6f54 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -443,6 +443,8 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, */ void migrate_page_copy(struct page *newpage, struct page *page) { + int cpupid; + if (PageHuge(page) || PageTransHuge(page)) copy_huge_page(newpage, page); else @@ -479,6 +481,13 @@ void migrate_page_copy(struct page *newpage, struct page *page) __set_page_dirty_nobuffers(newpage); } + /* + * Copy NUMA information to the new page, to prevent over-eager + * future migrations of this same page. + */ + cpupid = page_cpupid_xchg_last(page, -1); + page_cpupid_xchg_last(newpage, cpupid); + mlock_migrate_page(newpage, page); ksm_migrate_page(newpage, page); /* -- cgit v1.2.3 From 6688cc05473b36a0a3d3971e1adf1712919b32eb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:29:24 +0100 Subject: mm: numa: Do not group on RO pages And here's a little something to make sure not the whole world ends up in a single group. As while we don't migrate shared executable pages, we do scan/fault on them. And since everybody links to libc, everybody ends up in the same group. Suggested-by: Rik van Riel Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1381141781-10992-47-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 +++++-- kernel/sched/fair.c | 5 +++-- mm/huge_memory.c | 15 +++++++++++++-- mm/memory.c | 30 ++++++++++++++++++++++++++---- 4 files changed, 47 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/include/linux/sched.h b/include/linux/sched.h index b0b343b1ba64..ff543851a18a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1450,13 +1450,16 @@ struct task_struct { /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +#define TNF_MIGRATED 0x01 +#define TNF_NO_GROUP 0x02 + #ifdef CONFIG_NUMA_BALANCING -extern void task_numa_fault(int last_node, int node, int pages, bool migrated); +extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); #else static inline void task_numa_fault(int last_node, int node, int pages, - bool migrated) + int flags) { } static inline pid_t task_numa_group_id(struct task_struct *p) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5bd309c035c7..35661b8afb4e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1361,9 +1361,10 @@ void task_numa_free(struct task_struct *p) /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int last_cpupid, int node, int pages, bool migrated) +void task_numa_fault(int last_cpupid, int node, int pages, int flags) { struct task_struct *p = current; + bool migrated = flags & TNF_MIGRATED; int priv; if (!numabalancing_enabled) @@ -1394,7 +1395,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, bool migrated) priv = 1; } else { priv = cpupid_match_pid(p, last_cpupid); - if (!priv) + if (!priv && !(flags & TNF_NO_GROUP)) task_numa_group(p, last_cpupid); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index becf92ca54f3..7ab4e32afe12 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1285,6 +1285,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, int target_nid, last_cpupid = -1; bool page_locked; bool migrated = false; + int flags = 0; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) @@ -1298,6 +1299,14 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (page_nid == this_nid) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + /* + * Avoid grouping on DSO/COW pages in specific and RO pages + * in general, RO pages shouldn't hurt as much anyway since + * they can be in shared cache state. + */ + if (!pmd_write(pmd)) + flags |= TNF_NO_GROUP; + /* * Acquire the page lock to serialise THP migrations but avoid dropping * page_table_lock if at all possible @@ -1343,8 +1352,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd, addr, page, target_nid); - if (migrated) + if (migrated) { + flags |= TNF_MIGRATED; page_nid = target_nid; + } goto out; clear_pmdnuma: @@ -1362,7 +1373,7 @@ out: page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, migrated); + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); return 0; } diff --git a/mm/memory.c b/mm/memory.c index c57efa25cdbb..eba846bcf124 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3547,6 +3547,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, int last_cpupid; int target_nid; bool migrated = false; + int flags = 0; /* * The "pte" at this point cannot be used safely without @@ -3575,6 +3576,14 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } BUG_ON(is_zero_pfn(page_to_pfn(page))); + /* + * Avoid grouping on DSO/COW pages in specific and RO pages + * in general, RO pages shouldn't hurt as much anyway since + * they can be in shared cache state. + */ + if (!pte_write(pte)) + flags |= TNF_NO_GROUP; + last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); @@ -3586,12 +3595,14 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Migrate to the requested node */ migrated = migrate_misplaced_page(page, vma, target_nid); - if (migrated) + if (migrated) { page_nid = target_nid; + flags |= TNF_MIGRATED; + } out: if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, 1, migrated); + task_numa_fault(last_cpupid, page_nid, 1, flags); return 0; } @@ -3632,6 +3643,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, int page_nid = -1; int target_nid; bool migrated = false; + int flags = 0; if (!pte_present(pteval)) continue; @@ -3651,20 +3663,30 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!page)) continue; + /* + * Avoid grouping on DSO/COW pages in specific and RO pages + * in general, RO pages shouldn't hurt as much anyway since + * they can be in shared cache state. + */ + if (!pte_write(pteval)) + flags |= TNF_NO_GROUP; + last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(pte, ptl); if (target_nid != -1) { migrated = migrate_misplaced_page(page, vma, target_nid); - if (migrated) + if (migrated) { page_nid = target_nid; + flags |= TNF_MIGRATED; + } } else { put_page(page); } if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, 1, migrated); + task_numa_fault(last_cpupid, page_nid, 1, flags); pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } -- cgit v1.2.3 From 0f19c17929c952c6f0966d93ab05558e7bf814cc Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:25 +0100 Subject: mm: numa: Do not batch handle PMD pages With the THP migration races closed it is still possible to occasionally see corruption. The problem is related to handling PMD pages in batch. When a page fault is handled it can be assumed that the page being faulted will also be flushed from the TLB. The same flushing does not happen when handling PMD pages in batch. Fixing is straight forward but there are a number of reasons not to 1. Multiple TLB flushes may have to be sent depending on what pages get migrated 2. The handling of PMDs in batch means that faults get accounted to the task that is handling the fault. While care is taken to only mark PMDs where the last CPU and PID match it can still have problems due to PID truncation when matching PIDs. 3. Batching on the PMD level may reduce faults but setting pmd_numa requires taking a heavy lock that can contend with THP migration and handling the fault requires the release/acquisition of the PTL for every page migrated. It's still pretty heavy. PMD batch handling is not something that people ever have been happy with. This patch removes it and later patches will deal with the additional fault overhead using more installigent migrate rate adaption. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-48-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/memory.c | 101 ++-------------------------------------------------------- mm/mprotect.c | 47 ++------------------------- 2 files changed, 4 insertions(+), 144 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index eba846bcf124..9898eeb9a21c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3606,103 +3606,6 @@ out: return 0; } -/* NUMA hinting page fault entry point for regular pmds */ -#ifdef CONFIG_NUMA_BALANCING -static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) -{ - pmd_t pmd; - pte_t *pte, *orig_pte; - unsigned long _addr = addr & PMD_MASK; - unsigned long offset; - spinlock_t *ptl; - bool numa = false; - int last_cpupid; - - spin_lock(&mm->page_table_lock); - pmd = *pmdp; - if (pmd_numa(pmd)) { - set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); - numa = true; - } - spin_unlock(&mm->page_table_lock); - - if (!numa) - return 0; - - /* we're in a page fault so some vma must be in the range */ - BUG_ON(!vma); - BUG_ON(vma->vm_start >= _addr + PMD_SIZE); - offset = max(_addr, vma->vm_start) & ~PMD_MASK; - VM_BUG_ON(offset >= PMD_SIZE); - orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); - pte += offset >> PAGE_SHIFT; - for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { - pte_t pteval = *pte; - struct page *page; - int page_nid = -1; - int target_nid; - bool migrated = false; - int flags = 0; - - if (!pte_present(pteval)) - continue; - if (!pte_numa(pteval)) - continue; - if (addr >= vma->vm_end) { - vma = find_vma(mm, addr); - /* there's a pte present so there must be a vma */ - BUG_ON(!vma); - BUG_ON(addr < vma->vm_start); - } - if (pte_numa(pteval)) { - pteval = pte_mknonnuma(pteval); - set_pte_at(mm, addr, pte, pteval); - } - page = vm_normal_page(vma, addr, pteval); - if (unlikely(!page)) - continue; - - /* - * Avoid grouping on DSO/COW pages in specific and RO pages - * in general, RO pages shouldn't hurt as much anyway since - * they can be in shared cache state. - */ - if (!pte_write(pteval)) - flags |= TNF_NO_GROUP; - - last_cpupid = page_cpupid_last(page); - page_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, addr, page_nid); - pte_unmap_unlock(pte, ptl); - if (target_nid != -1) { - migrated = migrate_misplaced_page(page, vma, target_nid); - if (migrated) { - page_nid = target_nid; - flags |= TNF_MIGRATED; - } - } else { - put_page(page); - } - - if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, 1, flags); - - pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); - } - pte_unmap_unlock(orig_pte, ptl); - - return 0; -} -#else -static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) -{ - BUG(); - return 0; -} -#endif /* CONFIG_NUMA_BALANCING */ - /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3841,8 +3744,8 @@ retry: } } - if (pmd_numa(*pmd)) - return do_pmd_numa_page(mm, vma, address, pmd); + /* THP should already have been handled */ + BUG_ON(pmd_numa(*pmd)); /* * Use __pte_alloc instead of pte_alloc_map, because we can't diff --git a/mm/mprotect.c b/mm/mprotect.c index 9a74855f1241..a0302ac0be98 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -37,15 +37,12 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa, bool *ret_all_same_cpupid) + int dirty_accountable, int prot_numa) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; - bool all_same_cpupid = true; - int last_cpu = -1; - int last_pid = -1; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -64,19 +61,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page = vm_normal_page(vma, addr, oldpte); if (page) { - int cpupid = page_cpupid_last(page); - int this_cpu = cpupid_to_cpu(cpupid); - int this_pid = cpupid_to_pid(cpupid); - - if (last_cpu == -1) - last_cpu = this_cpu; - if (last_pid == -1) - last_pid = this_pid; - if (last_cpu != this_cpu || - last_pid != this_pid) { - all_same_cpupid = false; - } - if (!pte_numa(oldpte)) { ptent = pte_mknuma(ptent); updated = true; @@ -115,26 +99,9 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); - *ret_all_same_cpupid = all_same_cpupid; return pages; } -#ifdef CONFIG_NUMA_BALANCING -static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) -{ - spin_lock(&mm->page_table_lock); - set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); - spin_unlock(&mm->page_table_lock); -} -#else -static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) -{ - BUG(); -} -#endif /* CONFIG_NUMA_BALANCING */ - static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) @@ -142,7 +109,6 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pmd_t *pmd; unsigned long next; unsigned long pages = 0; - bool all_same_cpupid; pmd = pmd_offset(pud, addr); do { @@ -168,17 +134,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, if (pmd_none_or_clear_bad(pmd)) continue; this_pages = change_pte_range(vma, pmd, addr, next, newprot, - dirty_accountable, prot_numa, &all_same_cpupid); + dirty_accountable, prot_numa); pages += this_pages; - - /* - * If we are changing protections for NUMA hinting faults then - * set pmd_numa if the examined pages were all on the same - * node. This allows a regular PMD to be handled as one fault - * and effectively batches the taking of the PTL - */ - if (prot_numa && this_pages && all_same_cpupid) - change_pmd_protnuma(vma->vm_mm, addr, pmd); } while (pmd++, addr = next, addr != end); return pages; -- cgit v1.2.3 From dabe1d992414a6456e60e41f1d1ad8affc6d444d Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:34 +0100 Subject: sched/numa: Be more careful about joining numa groups Due to the way the pid is truncated, and tasks are moved between CPUs by the scheduler, it is possible for the current task_numa_fault to group together tasks that do not actually share memory together. This patch adds a few easy sanity checks to task_numa_fault, joining tasks together if they share the same tsk->mm, or if the fault was on a page with an elevated mapcount, in a shared VMA. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-57-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/fair.c | 16 +++++++++++----- mm/memory.c | 7 +++++++ 3 files changed, 19 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1127a46ac3d2..59f953b2e413 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1454,6 +1454,7 @@ struct task_struct { #define TNF_MIGRATED 0x01 #define TNF_NO_GROUP 0x02 +#define TNF_SHARED 0x04 #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5166b9b1af70..222c2d0b6ae2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1381,7 +1381,7 @@ static void double_lock(spinlock_t *l1, spinlock_t *l2) spin_lock_nested(l2, SINGLE_DEPTH_NESTING); } -static void task_numa_group(struct task_struct *p, int cpupid) +static void task_numa_group(struct task_struct *p, int cpupid, int flags) { struct numa_group *grp, *my_grp; struct task_struct *tsk; @@ -1439,10 +1439,16 @@ static void task_numa_group(struct task_struct *p, int cpupid) if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) goto unlock; - if (!get_numa_group(grp)) - goto unlock; + /* Always join threads in the same process. */ + if (tsk->mm == current->mm) + join = true; + + /* Simple filter to avoid false positives due to PID collisions */ + if (flags & TNF_SHARED) + join = true; - join = true; + if (join && !get_numa_group(grp)) + join = false; unlock: rcu_read_unlock(); @@ -1539,7 +1545,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) } else { priv = cpupid_match_pid(p, last_cpupid); if (!priv && !(flags & TNF_NO_GROUP)) - task_numa_group(p, last_cpupid); + task_numa_group(p, last_cpupid, flags); } /* diff --git a/mm/memory.c b/mm/memory.c index 9898eeb9a21c..823720c43ea9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3584,6 +3584,13 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_write(pte)) flags |= TNF_NO_GROUP; + /* + * Flag if the page is shared between multiple address spaces. This + * is later used when determining whether to group tasks together + */ + if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) + flags |= TNF_SHARED; + last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); -- cgit v1.2.3 From 04bb2f9475054298f0c67a89ca92cade42d3fe5e Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:36 +0100 Subject: sched/numa: Adjust scan rate in task_numa_placement Adjust numa_scan_period in task_numa_placement, depending on how much useful work the numa code can do. The more local faults there are in a given scan window the longer the period (and hence the slower the scan rate) during the next window. If there are excessive shared faults then the scan period will decrease with the amount of scaling depending on whether the ratio of shared/private faults. If the preferred node changes then the scan rate is reset to recheck if the task is properly placed. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-59-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 ++++ kernel/sched/fair.c | 112 +++++++++++++++++++++++++++++++++++++++----------- mm/huge_memory.c | 4 +- mm/memory.c | 9 ++-- 4 files changed, 105 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/include/linux/sched.h b/include/linux/sched.h index 59f953b2e413..2292f6c1596f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1365,6 +1365,14 @@ struct task_struct { */ unsigned long *numa_faults_buffer; + /* + * numa_faults_locality tracks if faults recorded during the last + * scan window were remote/local. The task scan period is adapted + * based on the locality of the faults with different weights + * depending on whether they were shared or private faults + */ + unsigned long numa_faults_locality[2]; + int numa_preferred_nid; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ @@ -1455,6 +1463,7 @@ struct task_struct { #define TNF_MIGRATED 0x01 #define TNF_NO_GROUP 0x02 #define TNF_SHARED 0x04 +#define TNF_FAULT_LOCAL 0x08 #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d26a16e45437..66237ff8b01e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1241,6 +1241,12 @@ static int task_numa_migrate(struct task_struct *p) sched_setnuma(p, env.dst_nid); + /* + * Reset the scan period if the task is being rescheduled on an + * alternative node to recheck if the tasks is now properly placed. + */ + p->numa_scan_period = task_scan_min(p); + if (env.best_task == NULL) { int ret = migrate_task_to(p, env.best_cpu); return ret; @@ -1276,10 +1282,86 @@ static void numa_migrate_preferred(struct task_struct *p) p->numa_migrate_retry = jiffies + HZ*5; } +/* + * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS + * increments. The more local the fault statistics are, the higher the scan + * period will be for the next scan window. If local/remote ratio is below + * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the + * scan period will decrease + */ +#define NUMA_PERIOD_SLOTS 10 +#define NUMA_PERIOD_THRESHOLD 3 + +/* + * Increase the scan period (slow down scanning) if the majority of + * our memory is already on our local node, or if the majority of + * the page accesses are shared with other processes. + * Otherwise, decrease the scan period. + */ +static void update_task_scan_period(struct task_struct *p, + unsigned long shared, unsigned long private) +{ + unsigned int period_slot; + int ratio; + int diff; + + unsigned long remote = p->numa_faults_locality[0]; + unsigned long local = p->numa_faults_locality[1]; + + /* + * If there were no record hinting faults then either the task is + * completely idle or all activity is areas that are not of interest + * to automatic numa balancing. Scan slower + */ + if (local + shared == 0) { + p->numa_scan_period = min(p->numa_scan_period_max, + p->numa_scan_period << 1); + + p->mm->numa_next_scan = jiffies + + msecs_to_jiffies(p->numa_scan_period); + + return; + } + + /* + * Prepare to scale scan period relative to the current period. + * == NUMA_PERIOD_THRESHOLD scan period stays the same + * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) + * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) + */ + period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); + ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); + if (ratio >= NUMA_PERIOD_THRESHOLD) { + int slot = ratio - NUMA_PERIOD_THRESHOLD; + if (!slot) + slot = 1; + diff = slot * period_slot; + } else { + diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; + + /* + * Scale scan rate increases based on sharing. There is an + * inverse relationship between the degree of sharing and + * the adjustment made to the scanning period. Broadly + * speaking the intent is that there is little point + * scanning faster if shared accesses dominate as it may + * simply bounce migrations uselessly + */ + period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); + ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); + diff = (diff * ratio) / NUMA_PERIOD_SLOTS; + } + + p->numa_scan_period = clamp(p->numa_scan_period + diff, + task_scan_min(p), task_scan_max(p)); + memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; unsigned long max_faults = 0, max_group_faults = 0; + unsigned long fault_types[2] = { 0, 0 }; spinlock_t *group_lock = NULL; seq = ACCESS_ONCE(p->mm->numa_scan_seq); @@ -1309,6 +1391,7 @@ static void task_numa_placement(struct task_struct *p) /* Decay existing window, copy faults since last scan */ p->numa_faults[i] >>= 1; p->numa_faults[i] += p->numa_faults_buffer[i]; + fault_types[priv] += p->numa_faults_buffer[i]; p->numa_faults_buffer[i] = 0; faults += p->numa_faults[i]; @@ -1333,6 +1416,8 @@ static void task_numa_placement(struct task_struct *p) } } + update_task_scan_period(p, fault_types[0], fault_types[1]); + if (p->numa_group) { /* * If the preferred task and group nids are different, @@ -1538,6 +1623,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) BUG_ON(p->numa_faults_buffer); p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); p->total_numa_faults = 0; + memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } /* @@ -1552,19 +1638,6 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) task_numa_group(p, last_cpupid, flags, &priv); } - /* - * If pages are properly placed (did not migrate) then scan slower. - * This is reset periodically in case of phase changes - */ - if (!migrated) { - /* Initialise if necessary */ - if (!p->numa_scan_period_max) - p->numa_scan_period_max = task_scan_max(p); - - p->numa_scan_period = min(p->numa_scan_period_max, - p->numa_scan_period + 10); - } - task_numa_placement(p); /* Retry task to preferred node migration if it previously failed */ @@ -1575,6 +1648,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) p->numa_pages_migrated += pages; p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; + p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; } static void reset_ptenuma_scan(struct task_struct *p) @@ -1701,18 +1775,6 @@ void task_numa_work(struct callback_head *work) } out: - /* - * If the whole process was scanned without updates then no NUMA - * hinting faults are being recorded and scan rate should be lower. - */ - if (mm->numa_scan_offset == 0 && !nr_pte_updates) { - p->numa_scan_period = min(p->numa_scan_period_max, - p->numa_scan_period << 1); - - next_scan = now + msecs_to_jiffies(p->numa_scan_period); - mm->numa_next_scan = next_scan; - } - /* * It is possible to reach the end of the VMA list but the last few * VMAs are not guaranteed to the vma_migratable. If they are not, we diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7ab4e32afe12..1be2a1f95b61 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1296,8 +1296,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page_nid = page_to_nid(page); last_cpupid = page_cpupid_last(page); count_vm_numa_event(NUMA_HINT_FAULTS); - if (page_nid == this_nid) + if (page_nid == this_nid) { count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + flags |= TNF_FAULT_LOCAL; + } /* * Avoid grouping on DSO/COW pages in specific and RO pages diff --git a/mm/memory.c b/mm/memory.c index 823720c43ea9..1c7501f7fb1a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3527,13 +3527,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, } int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, - unsigned long addr, int page_nid) + unsigned long addr, int page_nid, + int *flags) { get_page(page); count_vm_numa_event(NUMA_HINT_FAULTS); - if (page_nid == numa_node_id()) + if (page_nid == numa_node_id()) { count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + *flags |= TNF_FAULT_LOCAL; + } return mpol_misplaced(page, vma, addr); } @@ -3593,7 +3596,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, addr, page_nid); + target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); pte_unmap_unlock(ptep, ptl); if (target_nid == -1) { put_page(page); -- cgit v1.2.3 From 1e3646ffc64b232cb14a5ef01d7b98997c1b73f9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:38 +0100 Subject: mm: numa: Revert temporarily disabling of NUMA migration With the scan rate code working (at least for multi-instance specjbb), the large hammer that is "sched: Do not migrate memory immediately after switching node" can be replaced with something smarter. Revert temporarily migration disabling and all traces of numa_migrate_seq. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-61-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/sched/core.c | 2 -- kernel/sched/fair.c | 25 +------------------------ mm/mempolicy.c | 12 ------------ 4 files changed, 1 insertion(+), 39 deletions(-) (limited to 'mm') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2292f6c1596f..d24f70ffddee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1340,7 +1340,6 @@ struct task_struct { #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; - int numa_migrate_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; unsigned long numa_migrate_retry; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 89c5ae836f66..0c3feebcf112 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1731,7 +1731,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->node_stamp = 0ULL; p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; - p->numa_migrate_seq = 1; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; p->numa_faults = NULL; @@ -4488,7 +4487,6 @@ void sched_setnuma(struct task_struct *p, int nid) p->sched_class->put_prev_task(rq, p); p->numa_preferred_nid = nid; - p->numa_migrate_seq = 1; if (running) p->sched_class->set_curr_task(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index da6fa22be000..8454c38b1b12 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1261,16 +1261,8 @@ static void numa_migrate_preferred(struct task_struct *p) { /* Success if task is already running on preferred CPU */ p->numa_migrate_retry = 0; - if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) { - /* - * If migration is temporarily disabled due to a task migration - * then re-enable it now as the task is running on its - * preferred node and memory should migrate locally - */ - if (!p->numa_migrate_seq) - p->numa_migrate_seq++; + if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) return; - } /* This task has no NUMA fault statistics yet */ if (unlikely(p->numa_preferred_nid == -1)) @@ -1367,7 +1359,6 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_scan_seq == seq) return; p->numa_scan_seq = seq; - p->numa_migrate_seq++; p->numa_scan_period_max = task_scan_max(p); /* If the task is part of a group prevent parallel updates to group stats */ @@ -4730,20 +4721,6 @@ static void move_task(struct task_struct *p, struct lb_env *env) set_task_cpu(p, env->dst_cpu); activate_task(env->dst_rq, p, 0); check_preempt_curr(env->dst_rq, p, 0); -#ifdef CONFIG_NUMA_BALANCING - if (p->numa_preferred_nid != -1) { - int src_nid = cpu_to_node(env->src_cpu); - int dst_nid = cpu_to_node(env->dst_cpu); - - /* - * If the load balancer has moved the task then limit - * migrations from taking place in the short term in - * case this is a short-lived migration. - */ - if (src_nid != dst_nid && dst_nid != p->numa_preferred_nid) - p->numa_migrate_seq = 0; - } -#endif } /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a5867ef24bda..2929c24c22b7 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2404,18 +2404,6 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long last_cpupid = page_cpupid_xchg_last(page, this_cpupid); if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) goto out; - -#ifdef CONFIG_NUMA_BALANCING - /* - * If the scheduler has just moved us away from our - * preferred node, do not bother migrating pages yet. - * This way a short and temporary process migration will - * not cause excessive memory migration. - */ - if (thisnid != current->numa_preferred_nid && - !current->numa_migrate_seq) - goto out; -#endif } if (curnid != polnid) -- cgit v1.2.3 From de1c9ce6f07fec0381a39a9d0b379ea35aa1167f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:39 +0100 Subject: sched/numa: Skip some page migrations after a shared fault Shared faults can lead to lots of unnecessary page migrations, slowing down the system, and causing private faults to hit the per-pgdat migration ratelimit. This patch adds sysctl numa_balancing_migrate_deferred, which specifies how many shared page migrations to skip unconditionally, after each page migration that is skipped because it is a shared fault. This reduces the number of page migrations back and forth in shared fault situations. It also gives a strong preference to the tasks that are already running where most of the memory is, and to moving the other tasks to near the memory. Testing this with a much higher scan rate than the default still seems to result in fewer page migrations than before. Memory seems to be somewhat better consolidated than previously, with multi-instance specjbb runs on a 4 node system. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-62-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- Documentation/sysctl/kernel.txt | 10 ++++++++- include/linux/sched.h | 5 ++++- kernel/sched/fair.c | 8 +++++++ kernel/sysctl.c | 7 ++++++ mm/mempolicy.c | 48 ++++++++++++++++++++++++++++++++++++++++- 5 files changed, 75 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 84f17800f8b5..4273b2d71a27 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -375,7 +375,8 @@ feature should be disabled. Otherwise, if the system overhead from the feature is too high then the rate the kernel samples for NUMA hinting faults may be controlled by the numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, -numa_balancing_scan_size_mb and numa_balancing_settle_count sysctls. +numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and +numa_balancing_migrate_deferred. ============================================================== @@ -421,6 +422,13 @@ the schedule balancer stops pushing the task towards a preferred node. This gives the scheduler a chance to place the task on an alternative node if the preferred node is overloaded. +numa_balancing_migrate_deferred is how many page migrations get skipped +unconditionally, after a page migration is skipped because a page is shared +with other tasks. This reduces page migration overhead, and determines +how much stronger the "move task near its memory" policy scheduler becomes, +versus the "move memory near its task" memory management policy, for workloads +with shared memory. + ============================================================== osrelease, ostype & version: diff --git a/include/linux/sched.h b/include/linux/sched.h index d24f70ffddee..833eed55cf43 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1342,6 +1342,8 @@ struct task_struct { int numa_scan_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; + int numa_preferred_nid; + int numa_migrate_deferred; unsigned long numa_migrate_retry; u64 node_stamp; /* migration stamp */ struct callback_head numa_work; @@ -1372,7 +1374,6 @@ struct task_struct { */ unsigned long numa_faults_locality[2]; - int numa_preferred_nid; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ @@ -1469,6 +1470,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); extern void task_numa_free(struct task_struct *p); + +extern unsigned int sysctl_numa_balancing_migrate_deferred; #else static inline void task_numa_fault(int last_node, int node, int pages, int flags) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8454c38b1b12..e7884dc3416d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -833,6 +833,14 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +/* + * After skipping a page migration on a shared page, skip N more numa page + * migrations unconditionally. This reduces the number of NUMA migrations + * in shared memory workloads, and has the effect of pulling tasks towards + * where their memory lives, over pulling the memory towards the task. + */ +unsigned int sysctl_numa_balancing_migrate_deferred = 16; + static unsigned int task_nr_scan_windows(struct task_struct *p) { unsigned long rss = 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e509b90a8002..a159e1fd2013 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -391,6 +391,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "numa_balancing_migrate_deferred", + .data = &sysctl_numa_balancing_migrate_deferred, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2929c24c22b7..71cb253368cb 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2301,6 +2301,35 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } +#ifdef CONFIG_NUMA_BALANCING +static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) +{ + /* Never defer a private fault */ + if (cpupid_match_pid(p, last_cpupid)) + return false; + + if (p->numa_migrate_deferred) { + p->numa_migrate_deferred--; + return true; + } + return false; +} + +static inline void defer_numa_migrate(struct task_struct *p) +{ + p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred; +} +#else +static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) +{ + return false; +} + +static inline void defer_numa_migrate(struct task_struct *p) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + /** * mpol_misplaced - check whether current page node is valid in policy * @@ -2402,7 +2431,24 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * relation. */ last_cpupid = page_cpupid_xchg_last(page, this_cpupid); - if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) + if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) { + + /* See sysctl_numa_balancing_migrate_deferred comment */ + if (!cpupid_match_pid(current, last_cpupid)) + defer_numa_migrate(current); + + goto out; + } + + /* + * The quadratic filter above reduces extraneous migration + * of shared pages somewhat. This code reduces it even more, + * reducing the overhead of page migrations of shared pages. + * This makes workloads with shared pages rely more on + * "move task near its memory", and less on "move memory + * towards its task", which is exactly what we want. + */ + if (numa_migrate_deferred(current, last_cpupid)) goto out; } -- cgit v1.2.3