summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/x86/entry/entry_32.S4
-rw-r--r--arch/x86/hyperv/hv_init.c5
-rw-r--r--arch/x86/hyperv/mmu.c57
-rw-r--r--arch/x86/include/asm/alternative-asm.h4
-rw-r--r--arch/x86/include/asm/alternative.h6
-rw-r--r--arch/x86/include/asm/mmu_context.h8
-rw-r--r--arch/x86/include/asm/mshyperv.h1
-rw-r--r--arch/x86/include/asm/tlbflush.h24
-rw-r--r--arch/x86/kernel/apic/apic.c15
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c27
-rw-r--r--arch/x86/kernel/kprobes/common.h13
-rw-r--r--arch/x86/kernel/kprobes/core.c2
-rw-r--r--arch/x86/kernel/reboot.c4
-rw-r--r--arch/x86/kernel/unwind_frame.c38
-rw-r--r--arch/x86/mm/Makefile11
-rw-r--r--arch/x86/mm/tlb.c153
16 files changed, 284 insertions, 88 deletions
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 8a13d468635a..50e0d2bc4528 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -176,7 +176,7 @@
/*
* This is a sneaky trick to help the unwinder find pt_regs on the stack. The
* frame pointer is replaced with an encoded pointer to pt_regs. The encoding
- * is just setting the LSB, which makes it an invalid stack address and is also
+ * is just clearing the MSB, which makes it an invalid stack address and is also
* a signal to the unwinder that it's a pt_regs pointer in disguise.
*
* NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
@@ -185,7 +185,7 @@
.macro ENCODE_FRAME_POINTER
#ifdef CONFIG_FRAME_POINTER
mov %esp, %ebp
- orl $0x1, %ebp
+ andl $0x7fffffff, %ebp
#endif
.endm
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 1a8eb550c40f..a5db63f728a2 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -85,6 +85,8 @@ EXPORT_SYMBOL_GPL(hyperv_cs);
u32 *hv_vp_index;
EXPORT_SYMBOL_GPL(hv_vp_index);
+u32 hv_max_vp_index;
+
static int hv_cpu_init(unsigned int cpu)
{
u64 msr_vp_index;
@@ -93,6 +95,9 @@ static int hv_cpu_init(unsigned int cpu)
hv_vp_index[smp_processor_id()] = msr_vp_index;
+ if (msr_vp_index > hv_max_vp_index)
+ hv_max_vp_index = msr_vp_index;
+
return 0;
}
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 39e7f6e50919..9cc9e1c1e2db 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -36,9 +36,9 @@ struct hv_flush_pcpu_ex {
/* Each gva in gva_list encodes up to 4096 pages to flush */
#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
-static struct hv_flush_pcpu __percpu *pcpu_flush;
+static struct hv_flush_pcpu __percpu **pcpu_flush;
-static struct hv_flush_pcpu_ex __percpu *pcpu_flush_ex;
+static struct hv_flush_pcpu_ex __percpu **pcpu_flush_ex;
/*
* Fills in gva_list starting from offset. Returns the number of items added.
@@ -76,6 +76,18 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
{
int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1;
+ /* valid_bank_mask can represent up to 64 banks */
+ if (hv_max_vp_index / 64 >= 64)
+ return 0;
+
+ /*
+ * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex
+ * structs are not cleared between calls, we risk flushing unneeded
+ * vCPUs otherwise.
+ */
+ for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++)
+ flush->hv_vp_set.bank_contents[vcpu_bank] = 0;
+
/*
* Some banks may end up being empty but this is acceptable.
*/
@@ -83,11 +95,6 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
vcpu = hv_cpu_number_to_vp_number(cpu);
vcpu_bank = vcpu / 64;
vcpu_offset = vcpu % 64;
-
- /* valid_bank_mask can represent up to 64 banks */
- if (vcpu_bank >= 64)
- return 0;
-
__set_bit(vcpu_offset, (unsigned long *)
&flush->hv_vp_set.bank_contents[vcpu_bank]);
if (vcpu_bank >= nr_bank)
@@ -102,6 +109,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
const struct flush_tlb_info *info)
{
int cpu, vcpu, gva_n, max_gvas;
+ struct hv_flush_pcpu **flush_pcpu;
struct hv_flush_pcpu *flush;
u64 status = U64_MAX;
unsigned long flags;
@@ -116,7 +124,17 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
local_irq_save(flags);
- flush = this_cpu_ptr(pcpu_flush);
+ flush_pcpu = this_cpu_ptr(pcpu_flush);
+
+ if (unlikely(!*flush_pcpu))
+ *flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+
+ flush = *flush_pcpu;
+
+ if (unlikely(!flush)) {
+ local_irq_restore(flags);
+ goto do_native;
+ }
if (info->mm) {
flush->address_space = virt_to_phys(info->mm->pgd);
@@ -173,6 +191,7 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
const struct flush_tlb_info *info)
{
int nr_bank = 0, max_gvas, gva_n;
+ struct hv_flush_pcpu_ex **flush_pcpu;
struct hv_flush_pcpu_ex *flush;
u64 status = U64_MAX;
unsigned long flags;
@@ -187,7 +206,17 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
local_irq_save(flags);
- flush = this_cpu_ptr(pcpu_flush_ex);
+ flush_pcpu = this_cpu_ptr(pcpu_flush_ex);
+
+ if (unlikely(!*flush_pcpu))
+ *flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+
+ flush = *flush_pcpu;
+
+ if (unlikely(!flush)) {
+ local_irq_restore(flags);
+ goto do_native;
+ }
if (info->mm) {
flush->address_space = virt_to_phys(info->mm->pgd);
@@ -222,18 +251,18 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
status = hv_do_rep_hypercall(
HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
- 0, nr_bank + 2, flush, NULL);
+ 0, nr_bank, flush, NULL);
} else if (info->end &&
((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
status = hv_do_rep_hypercall(
HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
- 0, nr_bank + 2, flush, NULL);
+ 0, nr_bank, flush, NULL);
} else {
gva_n = fill_gva_list(flush->gva_list, nr_bank,
info->start, info->end);
status = hv_do_rep_hypercall(
HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
- gva_n, nr_bank + 2, flush, NULL);
+ gva_n, nr_bank, flush, NULL);
}
local_irq_restore(flags);
@@ -266,7 +295,7 @@ void hyper_alloc_mmu(void)
return;
if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
- pcpu_flush = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+ pcpu_flush = alloc_percpu(struct hv_flush_pcpu *);
else
- pcpu_flush_ex = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+ pcpu_flush_ex = alloc_percpu(struct hv_flush_pcpu_ex *);
}
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index e7636bac7372..6c98821fef5e 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -62,8 +62,10 @@
#define new_len2 145f-144f
/*
- * max without conditionals. Idea adapted from:
+ * gas compatible max based on the idea from:
* http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ *
+ * The additional "-" is needed because gas uses a "true" value of -1.
*/
#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index c096624137ae..ccbe24e697c4 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -103,12 +103,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
alt_end_marker ":\n"
/*
- * max without conditionals. Idea adapted from:
+ * gas compatible max based on the idea from:
* http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
*
- * The additional "-" is needed because gas works with s32s.
+ * The additional "-" is needed because gas uses a "true" value of -1.
*/
-#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))"
+#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))"
/*
* Pad the second replacement alternative with additional NOPs if it is
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index c120b5db178a..3c856a15b98e 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -126,13 +126,7 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
DEBUG_LOCKS_WARN_ON(preemptible());
}
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
- int cpu = smp_processor_id();
-
- if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
- cpumask_clear_cpu(cpu, mm_cpumask(mm));
-}
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 738503e1f80c..530f448fddaf 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -289,6 +289,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
* to this information.
*/
extern u32 *hv_vp_index;
+extern u32 hv_max_vp_index;
/**
* hv_cpu_number_to_vp_number() - Map CPU to VP.
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4893abf7f74f..d362161d3291 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -83,6 +83,13 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
#endif
/*
+ * If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point
+ * to init_mm when we switch to a kernel thread (e.g. the idle thread). If
+ * it's false, then we immediately switch CR3 when entering a kernel thread.
+ */
+DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
+
+/*
* 6 because 6 should be plenty and struct tlb_state will fit in
* two cache lines.
*/
@@ -105,6 +112,23 @@ struct tlb_state {
u16 next_asid;
/*
+ * We can be in one of several states:
+ *
+ * - Actively using an mm. Our CPU's bit will be set in
+ * mm_cpumask(loaded_mm) and is_lazy == false;
+ *
+ * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit
+ * will not be set in mm_cpumask(&init_mm) and is_lazy == false.
+ *
+ * - Lazily using a real mm. loaded_mm != &init_mm, our bit
+ * is set in mm_cpumask(loaded_mm), but is_lazy == true.
+ * We're heuristically guessing that the CR3 load we
+ * skipped more than makes up for the overhead added by
+ * lazy mode.
+ */
+ bool is_lazy;
+
+ /*
* Access to this CR4 shadow and to H/W CR4 is protected by
* disabling interrupts when modifying either one.
*/
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index d705c769f77d..ff891772c9f8 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -573,11 +573,21 @@ static u32 bdx_deadline_rev(void)
return ~0U;
}
+static u32 skx_deadline_rev(void)
+{
+ switch (boot_cpu_data.x86_mask) {
+ case 0x03: return 0x01000136;
+ case 0x04: return 0x02000014;
+ }
+
+ return ~0U;
+}
+
static const struct x86_cpu_id deadline_match[] = {
DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev),
DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020),
DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_X, 0x02000014),
+ DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev),
DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22),
DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20),
@@ -600,7 +610,8 @@ static void apic_check_deadline_errata(void)
const struct x86_cpu_id *m;
u32 rev;
- if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) ||
+ boot_cpu_has(X86_FEATURE_HYPERVISOR))
return;
m = x86_match_cpu(deadline_match);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 86e8f0b2537b..c4fa4a85d4cb 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -122,9 +122,6 @@ static bool __init check_loader_disabled_bsp(void)
bool *res = &dis_ucode_ldr;
#endif
- if (!have_cpuid_p())
- return *res;
-
/*
* CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
* completely accurate as xen pv guests don't see that CPUID bit set but
@@ -166,24 +163,36 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name)
void __init load_ucode_bsp(void)
{
unsigned int cpuid_1_eax;
+ bool intel = true;
- if (check_loader_disabled_bsp())
+ if (!have_cpuid_p())
return;
cpuid_1_eax = native_cpuid_eax(1);
switch (x86_cpuid_vendor()) {
case X86_VENDOR_INTEL:
- if (x86_family(cpuid_1_eax) >= 6)
- load_ucode_intel_bsp();
+ if (x86_family(cpuid_1_eax) < 6)
+ return;
break;
+
case X86_VENDOR_AMD:
- if (x86_family(cpuid_1_eax) >= 0x10)
- load_ucode_amd_bsp(cpuid_1_eax);
+ if (x86_family(cpuid_1_eax) < 0x10)
+ return;
+ intel = false;
break;
+
default:
- break;
+ return;
}
+
+ if (check_loader_disabled_bsp())
+ return;
+
+ if (intel)
+ load_ucode_intel_bsp();
+ else
+ load_ucode_amd_bsp(cpuid_1_eax);
}
static bool check_loader_disabled_ap(void)
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index db2182d63ed0..3fc0f9a794cb 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -3,6 +3,15 @@
/* Kprobes and Optprobes common header */
+#include <asm/asm.h>
+
+#ifdef CONFIG_FRAME_POINTER
+# define SAVE_RBP_STRING " push %" _ASM_BP "\n" \
+ " mov %" _ASM_SP ", %" _ASM_BP "\n"
+#else
+# define SAVE_RBP_STRING " push %" _ASM_BP "\n"
+#endif
+
#ifdef CONFIG_X86_64
#define SAVE_REGS_STRING \
/* Skip cs, ip, orig_ax. */ \
@@ -17,7 +26,7 @@
" pushq %r10\n" \
" pushq %r11\n" \
" pushq %rbx\n" \
- " pushq %rbp\n" \
+ SAVE_RBP_STRING \
" pushq %r12\n" \
" pushq %r13\n" \
" pushq %r14\n" \
@@ -48,7 +57,7 @@
" pushl %es\n" \
" pushl %ds\n" \
" pushl %eax\n" \
- " pushl %ebp\n" \
+ SAVE_RBP_STRING \
" pushl %edi\n" \
" pushl %esi\n" \
" pushl %edx\n" \
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index f0153714ddac..0742491cbb73 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -1080,8 +1080,6 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
* raw stack chunk with redzones:
*/
__memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr));
- regs->flags &= ~X86_EFLAGS_IF;
- trace_hardirqs_off();
regs->ip = (unsigned long)(jp->entry);
/*
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 54180fa6f66f..add33f600531 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -105,6 +105,10 @@ void __noreturn machine_real_restart(unsigned int type)
load_cr3(initial_page_table);
#else
write_cr3(real_mode_header->trampoline_pgd);
+
+ /* Exiting long mode will fail if CR4.PCIDE is set. */
+ if (static_cpu_has(X86_FEATURE_PCID))
+ cr4_clear_bits(X86_CR4_PCIDE);
#endif
/* Jump to the identity-mapped low memory code */
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index d145a0b1f529..3dc26f95d46e 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -44,7 +44,8 @@ static void unwind_dump(struct unwind_state *state)
state->stack_info.type, state->stack_info.next_sp,
state->stack_mask, state->graph_idx);
- for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+ for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp;
+ sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
break;
@@ -174,6 +175,7 @@ static bool is_last_task_frame(struct unwind_state *state)
* This determines if the frame pointer actually contains an encoded pointer to
* pt_regs on the stack. See ENCODE_FRAME_POINTER.
*/
+#ifdef CONFIG_X86_64
static struct pt_regs *decode_frame_pointer(unsigned long *bp)
{
unsigned long regs = (unsigned long)bp;
@@ -183,6 +185,23 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp)
return (struct pt_regs *)(regs & ~0x1);
}
+#else
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+ unsigned long regs = (unsigned long)bp;
+
+ if (regs & 0x80000000)
+ return NULL;
+
+ return (struct pt_regs *)(regs | 0x80000000);
+}
+#endif
+
+#ifdef CONFIG_X86_32
+#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long))
+#else
+#define KERNEL_REGS_SIZE (sizeof(struct pt_regs))
+#endif
static bool update_stack_state(struct unwind_state *state,
unsigned long *next_bp)
@@ -202,7 +221,7 @@ static bool update_stack_state(struct unwind_state *state,
regs = decode_frame_pointer(next_bp);
if (regs) {
frame = (unsigned long *)regs;
- len = regs_size(regs);
+ len = KERNEL_REGS_SIZE;
state->got_irq = true;
} else {
frame = next_bp;
@@ -226,6 +245,14 @@ static bool update_stack_state(struct unwind_state *state,
frame < prev_frame_end)
return false;
+ /*
+ * On 32-bit with user mode regs, make sure the last two regs are safe
+ * to access:
+ */
+ if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) &&
+ !on_stack(info, frame, len + 2*sizeof(long)))
+ return false;
+
/* Move state to the next frame: */
if (regs) {
state->regs = regs;
@@ -328,6 +355,13 @@ bad_address:
state->regs->sp < (unsigned long)task_pt_regs(state->task))
goto the_end;
+ /*
+ * There are some known frame pointer issues on 32-bit. Disable
+ * unwinder warnings on 32-bit until it gets objtool support.
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ goto the_end;
+
if (state->regs) {
printk_deferred_once(KERN_WARNING
"WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 72bf8c01c6e3..e1f095884386 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,12 @@
-# Kernel does not boot with instrumentation of tlb.c.
-KCOV_INSTRUMENT_tlb.o := n
+# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c
+KCOV_INSTRUMENT_tlb.o := n
+KCOV_INSTRUMENT_mem_encrypt.o := n
+
+KASAN_SANITIZE_mem_encrypt.o := n
+
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_mem_encrypt.o = -pg
+endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o physaddr.o setup_nx.o tlb.o
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 49d9778376d7..658bf0090565 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,8 @@
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
+DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
+
static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
u16 *new_asid, bool *need_flush)
{
@@ -80,7 +82,7 @@ void leave_mm(int cpu)
return;
/* Warn if we're not lazy. */
- WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
+ WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
switch_mm(NULL, &init_mm, NULL);
}
@@ -142,45 +144,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
__flush_tlb_all();
}
#endif
+ this_cpu_write(cpu_tlbstate.is_lazy, false);
if (real_prev == next) {
VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
- if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
- /*
- * There's nothing to do: we weren't lazy, and we
- * aren't changing our mm. We don't need to flush
- * anything, nor do we need to update CR3, CR4, or
- * LDTR.
- */
- return;
- }
-
- /* Resume remote flushes and then read tlb_gen. */
- cpumask_set_cpu(cpu, mm_cpumask(next));
- next_tlb_gen = atomic64_read(&next->context.tlb_gen);
-
- if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
- next_tlb_gen) {
- /*
- * Ideally, we'd have a flush_tlb() variant that
- * takes the known CR3 value as input. This would
- * be faster on Xen PV and on hypothetical CPUs
- * on which INVPCID is fast.
- */
- this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
- next_tlb_gen);
- write_cr3(build_cr3(next, prev_asid));
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
- TLB_FLUSH_ALL);
- }
-
/*
- * We just exited lazy mode, which means that CR4 and/or LDTR
- * may be stale. (Changes to the required CR4 and LDTR states
- * are not reflected in tlb_gen.)
+ * We don't currently support having a real mm loaded without
+ * our cpu set in mm_cpumask(). We have all the bookkeeping
+ * in place to figure out whether we would need to flush
+ * if our cpu were cleared in mm_cpumask(), but we don't
+ * currently use it.
*/
+ if (WARN_ON_ONCE(real_prev != &init_mm &&
+ !cpumask_test_cpu(cpu, mm_cpumask(next))))
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ return;
} else {
u16 new_asid;
bool need_flush;
@@ -199,10 +180,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
}
/* Stop remote flushes for the previous mm */
- if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
- cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
-
- VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
+ VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
+ real_prev != &init_mm);
+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
/*
* Start remote flushes and then read tlb_gen.
@@ -233,6 +213,37 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
}
/*
+ * enter_lazy_tlb() is a hint from the scheduler that we are entering a
+ * kernel thread or other context without an mm. Acceptable implementations
+ * include doing nothing whatsoever, switching to init_mm, or various clever
+ * lazy tricks to try to minimize TLB flushes.
+ *
+ * The scheduler reserves the right to call enter_lazy_tlb() several times
+ * in a row. It will notify us that we're going back to a real mm by
+ * calling switch_mm_irqs_off().
+ */
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
+ return;
+
+ if (static_branch_unlikely(&tlb_use_lazy_mode)) {
+ /*
+ * There's a significant optimization that may be possible
+ * here. We have accurate enough TLB flush tracking that we
+ * don't need to maintain coherence of TLB per se when we're
+ * lazy. We do, however, need to maintain coherence of
+ * paging-structure caches. We could, in principle, leave our
+ * old mm loaded and only switch to init_mm when
+ * tlb_remove_page() happens.
+ */
+ this_cpu_write(cpu_tlbstate.is_lazy, true);
+ } else {
+ switch_mm(NULL, &init_mm, NULL);
+ }
+}
+
+/*
* Call this when reinitializing a CPU. It fixes the following potential
* problems:
*
@@ -303,16 +314,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
+ if (unlikely(loaded_mm == &init_mm))
+ return;
+
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
- if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
+ if (this_cpu_read(cpu_tlbstate.is_lazy)) {
/*
- * We're in lazy mode -- don't flush. We can get here on
- * remote flushes due to races and on local flushes if a
- * kernel thread coincidentally flushes the mm it's lazily
- * still using.
+ * We're in lazy mode. We need to at least flush our
+ * paging-structure cache to avoid speculatively reading
+ * garbage into our TLB. Since switching to init_mm is barely
+ * slower than a minimal flush, just switch to init_mm.
*/
+ switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
}
@@ -611,3 +626,57 @@ static int __init create_tlb_single_page_flush_ceiling(void)
return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);
+
+static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[2];
+
+ buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? '1' : '0';
+ buf[1] = '\n';
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t tlblazy_write_file(struct file *file,
+ const char __user *user_buf, size_t count, loff_t *ppos)
+{
+ bool val;
+
+ if (kstrtobool_from_user(user_buf, count, &val))
+ return -EINVAL;
+
+ if (val)
+ static_branch_enable(&tlb_use_lazy_mode);
+ else
+ static_branch_disable(&tlb_use_lazy_mode);
+
+ return count;
+}
+
+static const struct file_operations fops_tlblazy = {
+ .read = tlblazy_read_file,
+ .write = tlblazy_write_file,
+ .llseek = default_llseek,
+};
+
+static int __init init_tlb_use_lazy_mode(void)
+{
+ if (boot_cpu_has(X86_FEATURE_PCID)) {
+ /*
+ * Heuristic: with PCID on, switching to and from
+ * init_mm is reasonably fast, but remote flush IPIs
+ * as expensive as ever, so turn off lazy TLB mode.
+ *
+ * We can't do this in setup_pcid() because static keys
+ * haven't been initialized yet, and it would blow up
+ * badly.
+ */
+ static_branch_disable(&tlb_use_lazy_mode);
+ }
+
+ debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR,
+ arch_debugfs_dir, NULL, &fops_tlblazy);
+ return 0;
+}
+late_initcall(init_tlb_use_lazy_mode);