diff options
Diffstat (limited to 'arch/x86')
42 files changed, 549 insertions, 244 deletions
diff --git a/arch/x86/Makefile b/arch/x86/Makefile index b39975977c03..fdc2e3abd615 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -305,6 +305,18 @@ ifeq ($(RETPOLINE_CFLAGS),) endif endif +ifdef CONFIG_UNWINDER_ORC +orc_hash_h := arch/$(SRCARCH)/include/generated/asm/orc_hash.h +orc_hash_sh := $(srctree)/scripts/orc_hash.sh +targets += $(orc_hash_h) +quiet_cmd_orc_hash = GEN $@ + cmd_orc_hash = mkdir -p $(dir $@); \ + $(CONFIG_SHELL) $(orc_hash_sh) < $< > $@ +$(orc_hash_h): $(srctree)/arch/x86/include/asm/orc_types.h $(orc_hash_sh) FORCE + $(call if_changed,orc_hash) +archprepare: $(orc_hash_h) +endif + archclean: $(Q)rm -rf $(objtree)/arch/i386 $(Q)rm -rf $(objtree)/arch/x86_64 diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S index 7c1abc513f34..9556dacd9841 100644 --- a/arch/x86/crypto/aria-aesni-avx-asm_64.S +++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S @@ -773,8 +773,6 @@ .octa 0x3F893781E95FE1576CDA64D2BA0CB204 #ifdef CONFIG_AS_GFNI -.section .rodata.cst8, "aM", @progbits, 8 -.align 8 /* AES affine: */ #define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0) .Ltf_aff_bitmatrix: diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 070cc4ef2672..27f3a7b34bd5 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -349,6 +349,16 @@ static struct event_constraint intel_spr_event_constraints[] = { EVENT_CONSTRAINT_END }; +static struct extra_reg intel_gnr_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x02c6, MSR_PEBS_FRONTEND, 0x9, FE), + INTEL_UEVENT_EXTRA_REG(0x03c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE), + INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0x7, FE), + INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE), + EVENT_EXTRA_END +}; EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); @@ -4074,7 +4084,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) if (x86_pmu.intel_cap.pebs_baseline) { arr[(*nr)++] = (struct perf_guest_switch_msr){ .msr = MSR_PEBS_DATA_CFG, - .host = cpuc->pebs_data_cfg, + .host = cpuc->active_pebs_data_cfg, .guest = kvm_pmu->pebs_data_cfg, }; } @@ -6496,6 +6506,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_SAPPHIRERAPIDS_X: case INTEL_FAM6_EMERALDRAPIDS_X: x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; + x86_pmu.extra_regs = intel_spr_extra_regs; fallthrough; case INTEL_FAM6_GRANITERAPIDS_X: case INTEL_FAM6_GRANITERAPIDS_D: @@ -6506,7 +6517,8 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_spr_event_constraints; x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints; - x86_pmu.extra_regs = intel_spr_extra_regs; + if (!x86_pmu.extra_regs) + x86_pmu.extra_regs = intel_gnr_extra_regs; x86_pmu.limit_period = spr_limit_period; x86_pmu.pebs_ept = 1; x86_pmu.pebs_aliases = NULL; @@ -6650,6 +6662,7 @@ __init int intel_pmu_init(void) pmu->pebs_constraints = intel_grt_pebs_event_constraints; pmu->extra_regs = intel_grt_extra_regs; if (is_mtl(boot_cpu_data.x86_model)) { + x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].extra_regs = intel_gnr_extra_regs; x86_pmu.pebs_latency_data = mtl_latency_data_small; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index fa9b209a11fa..d49e90dc04a4 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6150,6 +6150,7 @@ static struct intel_uncore_type spr_uncore_mdf = { }; #define UNCORE_SPR_NUM_UNCORE_TYPES 12 +#define UNCORE_SPR_CHA 0 #define UNCORE_SPR_IIO 1 #define UNCORE_SPR_IMC 6 #define UNCORE_SPR_UPI 8 @@ -6460,12 +6461,22 @@ static int uncore_type_max_boxes(struct intel_uncore_type **types, return max + 1; } +#define SPR_MSR_UNC_CBO_CONFIG 0x2FFE + void spr_uncore_cpu_init(void) { + struct intel_uncore_type *type; + u64 num_cbo; + uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR, UNCORE_SPR_MSR_EXTRA_UNCORES, spr_msr_uncores); + type = uncore_find_type_by_id(uncore_msr_uncores, UNCORE_SPR_CHA); + if (type) { + rdmsrl(SPR_MSR_UNC_CBO_CONFIG, num_cbo); + type->num_boxes = num_cbo; + } spr_uncore_iio_free_running.num_boxes = uncore_type_max_boxes(uncore_msr_uncores, UNCORE_SPR_IIO); } diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index a5f9474f08e1..6c04b52f139b 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -416,7 +416,7 @@ void __init hyperv_init(void) goto free_vp_assist_page; } - cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", + cpuhp = cpuhp_setup_state(CPUHP_AP_HYPERV_ONLINE, "x86/hyperv_init:online", hv_cpu_init, hv_cpu_die); if (cpuhp < 0) goto free_ghcb_page; diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 1ba5d3b99b16..85d38b9f3586 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -20,6 +20,8 @@ void __init hv_vtl_init_platform(void) { pr_info("Linux runs in Hyper-V Virtual Trust Level\n"); + x86_platform.realmode_reserve = x86_init_noop; + x86_platform.realmode_init = x86_init_noop; x86_init.irqs.pre_vector_init = x86_init_noop; x86_init.timers.timer_init = x86_init_noop; diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 1e51650b79d7..4f1ce5fc4e19 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +generated-y += orc_hash.h generated-y += syscalls_32.h generated-y += syscalls_64.h generated-y += syscalls_x32.h diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index c2d6cd78ed0c..78fcde7b1f07 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -39,7 +39,7 @@ extern void fpu_flush_thread(void); static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { if (cpu_feature_enabled(X86_FEATURE_FPU) && - !(current->flags & (PF_KTHREAD | PF_IO_WORKER))) { + !(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { save_fpregs_to_fpstate(old_fpu); /* * The save operation preserved register state, so the diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index c17e3e96fc1d..6c98f4bb4228 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -13,7 +13,6 @@ BUILD_BUG_ON(1) * at the call sites. */ KVM_X86_PMU_OP(hw_event_available) -KVM_X86_PMU_OP(pmc_is_enabled) KVM_X86_PMU_OP(pmc_idx_to_pmc) KVM_X86_PMU_OP(rdpmc_ecx_to_pmc) KVM_X86_PMU_OP(msr_idx_to_pmc) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fb9d1f2d6136..28bd38303d70 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -523,7 +523,7 @@ struct kvm_pmu { u64 global_status; u64 counter_bitmask[2]; u64 global_ctrl_mask; - u64 global_ovf_ctrl_mask; + u64 global_status_mask; u64 reserved_bits; u64 raw_event_mask; struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; diff --git a/arch/x86/include/asm/orc_header.h b/arch/x86/include/asm/orc_header.h new file mode 100644 index 000000000000..07bacf3e160e --- /dev/null +++ b/arch/x86/include/asm/orc_header.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ + +#ifndef _ORC_HEADER_H +#define _ORC_HEADER_H + +#include <linux/types.h> +#include <linux/compiler.h> +#include <asm/orc_hash.h> + +/* + * The header is currently a 20-byte hash of the ORC entry definition; see + * scripts/orc_hash.sh. + */ +#define ORC_HEADER \ + __used __section(".orc_header") __aligned(4) \ + static const u8 orc_header[] = { ORC_HASH } + +#endif /* _ORC_HEADER_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index dd61752f4c96..4070a01c11b7 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -17,6 +17,7 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_head64.o = -pg CFLAGS_REMOVE_sev.o = -pg +CFLAGS_REMOVE_rethook.o = -pg endif KASAN_SANITIZE_head$(BITS).o := n diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 6bde05a86b4e..896bc41cb2ba 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -97,7 +97,10 @@ static void init_x2apic_ldr(void) static int x2apic_phys_probe(void) { - if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys())) + if (!x2apic_mode) + return 0; + + if (x2apic_phys || x2apic_fadt_phys()) return 1; return apic == &apic_x2apic_phys; diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 5e868b62a7c4..0270925fe013 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -79,7 +79,7 @@ int detect_extended_topology_early(struct cpuinfo_x86 *c) * initial apic id, which also represents 32-bit extended x2apic id. */ c->initial_apicid = edx; - smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); + smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); #endif return 0; } @@ -109,7 +109,8 @@ int detect_extended_topology(struct cpuinfo_x86 *c) */ cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); c->initial_apicid = edx; - core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 0bf6779187dd..f18ca44c904b 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -195,7 +195,6 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%sCall Trace:\n", log_lvl); unwind_start(&state, task, regs, stack); - stack = stack ? : get_stack_pointer(task, regs); regs = unwind_get_entry_regs(&state, &partial); /* @@ -214,9 +213,13 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - hardirq stack * - entry stack */ - for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + for (stack = stack ?: get_stack_pointer(task, regs); + stack; + stack = stack_info.next_sp) { const char *stack_name; + stack = PTR_ALIGN(stack, sizeof(long)); + if (get_stack_info(stack, task, &stack_info, &visit_mask)) { /* * We weren't on a valid stack. It's possible that diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h index 9fcfa5c4dad7..af5cbdd9bd29 100644 --- a/arch/x86/kernel/fpu/context.h +++ b/arch/x86/kernel/fpu/context.h @@ -57,7 +57,7 @@ static inline void fpregs_restore_userregs(void) struct fpu *fpu = ¤t->thread.fpu; int cpu = smp_processor_id(); - if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_IO_WORKER))) + if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER))) return; if (!fpregs_state_valid(fpu, cpu)) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index caf33486dc5e..1015af1ae562 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -426,7 +426,7 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask) this_cpu_write(in_kernel_fpu, true); - if (!(current->flags & (PF_KTHREAD | PF_IO_WORKER)) && + if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); save_fpregs_to_fpstate(¤t->thread.fpu); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index a5df3e994f04..113c13376e51 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -77,6 +77,15 @@ SYM_CODE_START_NOALIGN(startup_64) call startup_64_setup_env popq %rsi + /* Now switch to __KERNEL_CS so IRET works reliably */ + pushq $__KERNEL_CS + leaq .Lon_kernel_cs(%rip), %rax + pushq %rax + lretq + +.Lon_kernel_cs: + UNWIND_HINT_END_OF_STACK + #ifdef CONFIG_AMD_MEM_ENCRYPT /* * Activate SEV/SME memory encryption if supported/enabled. This needs to @@ -90,15 +99,6 @@ SYM_CODE_START_NOALIGN(startup_64) popq %rsi #endif - /* Now switch to __KERNEL_CS so IRET works reliably */ - pushq $__KERNEL_CS - leaq .Lon_kernel_cs(%rip), %rax - pushq %rax - lretq - -.Lon_kernel_cs: - UNWIND_HINT_END_OF_STACK - /* Sanitize CPU configuration */ call verify_cpu diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 3ac50b7298d1..4d8e518365f4 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -7,6 +7,9 @@ #include <asm/unwind.h> #include <asm/orc_types.h> #include <asm/orc_lookup.h> +#include <asm/orc_header.h> + +ORC_HEADER; #define orc_warn(fmt, ...) \ printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0c9660a07b23..7f4d13383cf2 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -501,20 +501,15 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { - int r; - - r = -E2BIG; if (cpuid->nent < vcpu->arch.cpuid_nent) - goto out; - r = -EFAULT; + return -E2BIG; + if (copy_to_user(entries, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - return 0; + return -EFAULT; -out: cpuid->nent = vcpu->arch.cpuid_nent; - return r; + return 0; } /* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */ @@ -734,6 +729,10 @@ void kvm_set_cpu_caps(void) F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ ); + kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX, + F(PERFMON_V2) + ); + /* * Synthesize "LFENCE is serializing" into the AMD-defined entry in * KVM's supported CPUID if the feature is reported as supported by the @@ -948,7 +947,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) union cpuid10_eax eax; union cpuid10_edx edx; - if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { + if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } @@ -1128,7 +1127,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = 0; break; case 0x80000000: - entry->eax = min(entry->eax, 0x80000021); + entry->eax = min(entry->eax, 0x80000022); /* * Serializing LFENCE is reported in a multitude of ways, and * NullSegClearsBase is not reported in CPUID on Zen2; help @@ -1233,6 +1232,28 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ebx = entry->ecx = entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0021_EAX); break; + /* AMD Extended Performance Monitoring and Debug */ + case 0x80000022: { + union cpuid_0x80000022_ebx ebx; + + entry->ecx = entry->edx = 0; + if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { + entry->eax = entry->ebx; + break; + } + + cpuid_entry_override(entry, CPUID_8000_0022_EAX); + + if (kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) + ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp; + else if (kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) + ebx.split.num_core_pmc = AMD64_NUM_COUNTERS_CORE; + else + ebx.split.num_core_pmc = AMD64_NUM_COUNTERS; + + entry->ebx = ebx.full; + break; + } /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: /*Just support up to 0xC0000004 now*/ diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 4756bcb5724f..8dec646e764b 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -411,7 +411,10 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) pic_clear_isr(s, ret); if (addr1 >> 7 || ret != 2) pic_update_irq(s->pics_state); + /* Bit 7 is 1, means there's an interrupt */ + ret |= 0x80; } else { + /* Bit 7 is 0, means there's no interrupt */ ret = 0x07; pic_update_irq(s->pics_state); } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e542cf285b51..113ca9661ab2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -51,11 +51,6 @@ #define mod_64(x, y) ((x) % (y)) #endif -#define PRId64 "d" -#define PRIx64 "llx" -#define PRIu64 "u" -#define PRIo64 "o" - /* 14 is the version for Xeon and Pentium 8.4.8*/ #define APIC_VERSION 0x14UL #define LAPIC_MMIO_LENGTH (1 << 12) @@ -229,6 +224,23 @@ static int kvm_recalculate_phys_map(struct kvm_apic_map *new, u32 physical_id; /* + * For simplicity, KVM always allocates enough space for all possible + * xAPIC IDs. Yell, but don't kill the VM, as KVM can continue on + * without the optimized map. + */ + if (WARN_ON_ONCE(xapic_id > new->max_apic_id)) + return -EINVAL; + + /* + * Bail if a vCPU was added and/or enabled its APIC between allocating + * the map and doing the actual calculations for the map. Note, KVM + * hardcodes the x2APIC ID to vcpu_id, i.e. there's no TOCTOU bug if + * the compiler decides to reload x2apic_id after this check. + */ + if (x2apic_id > new->max_apic_id) + return -E2BIG; + + /* * Deliberately truncate the vCPU ID when detecting a mismatched APIC * ID to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a * 32-bit value. Any unwanted aliasing due to truncation results will @@ -253,8 +265,7 @@ static int kvm_recalculate_phys_map(struct kvm_apic_map *new, */ if (vcpu->kvm->arch.x2apic_format) { /* See also kvm_apic_match_physical_addr(). */ - if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) && - x2apic_id <= new->max_apic_id) + if (apic_x2apic_mode(apic) || x2apic_id > 0xff) new->phys_map[x2apic_id] = apic; if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c8961f45e3b1..03ff06cd65b3 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -58,6 +58,8 @@ extern bool itlb_multihit_kvm_mitigation; +static bool nx_hugepage_mitigation_hard_disabled; + int __read_mostly nx_huge_pages = -1; static uint __read_mostly nx_huge_pages_recovery_period_ms; #ifdef CONFIG_PREEMPT_RT @@ -67,12 +69,13 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 0; static uint __read_mostly nx_huge_pages_recovery_ratio = 60; #endif +static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp); static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, - .get = param_get_bool, + .get = get_nx_huge_pages, }; static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = { @@ -5797,6 +5800,14 @@ static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu vcpu_clear_mmio_info(vcpu, addr); + /* + * Walking and synchronizing SPTEs both assume they are operating in + * the context of the current MMU, and would need to be reworked if + * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT. + */ + if (WARN_ON_ONCE(mmu != vcpu->arch.mmu)) + return; + if (!VALID_PAGE(root_hpa)) return; @@ -6844,6 +6855,14 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } +static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) +{ + if (nx_hugepage_mitigation_hard_disabled) + return sprintf(buffer, "never\n"); + + return param_get_bool(buffer, kp); +} + static bool get_nx_auto_mode(void) { /* Return true when CPU has the bug, and mitigations are ON */ @@ -6860,15 +6879,29 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) bool old_val = nx_huge_pages; bool new_val; + if (nx_hugepage_mitigation_hard_disabled) + return -EPERM; + /* In "auto" mode deploy workaround only if CPU has the bug. */ - if (sysfs_streq(val, "off")) + if (sysfs_streq(val, "off")) { new_val = 0; - else if (sysfs_streq(val, "force")) + } else if (sysfs_streq(val, "force")) { new_val = 1; - else if (sysfs_streq(val, "auto")) + } else if (sysfs_streq(val, "auto")) { new_val = get_nx_auto_mode(); - else if (kstrtobool(val, &new_val) < 0) + } else if (sysfs_streq(val, "never")) { + new_val = 0; + + mutex_lock(&kvm_lock); + if (!list_empty(&vm_list)) { + mutex_unlock(&kvm_lock); + return -EBUSY; + } + nx_hugepage_mitigation_hard_disabled = true; + mutex_unlock(&kvm_lock); + } else if (kstrtobool(val, &new_val) < 0) { return -EINVAL; + } __set_nx_huge_pages(new_val); @@ -7006,6 +7039,9 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel uint old_period, new_period; int err; + if (nx_hugepage_mitigation_hard_disabled) + return -EPERM; + was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period); err = param_set_uint(val, kp); @@ -7091,7 +7127,10 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) */ slot = NULL; if (atomic_read(&kvm->nr_memslots_dirty_logging)) { - slot = gfn_to_memslot(kvm, sp->gfn); + struct kvm_memslots *slots; + + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, sp->gfn); WARN_ON_ONCE(!slot); } @@ -7161,6 +7200,9 @@ int kvm_mmu_post_init_vm(struct kvm *kvm) { int err; + if (nx_hugepage_mitigation_hard_disabled) + return 0; + err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0, "kvm-nx-lpage-recovery", &kvm->arch.nx_huge_page_recovery_thread); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 08340219c35a..512163d52194 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -592,7 +592,10 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, /* * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and - * does not hold the mmu_lock. + * does not hold the mmu_lock. On failure, i.e. if a different logical + * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with + * the current value, so the caller operates on fresh data, e.g. if it + * retries tdp_mmu_set_spte_atomic() */ if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) return -EBUSY; diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 9fac1ec03463..3eb6e7f47e96 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -25,10 +25,24 @@ #define IA32_MTRR_DEF_TYPE_FE (1ULL << 10) #define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff) +static bool is_mtrr_base_msr(unsigned int msr) +{ + /* MTRR base MSRs use even numbers, masks use odd numbers. */ + return !(msr & 0x1); +} + +static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu, + unsigned int msr) +{ + int index = (msr - MTRRphysBase_MSR(0)) / 2; + + return &vcpu->arch.mtrr_state.var_ranges[index]; +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { - case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: + case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1): case MSR_MTRRfix64K_00000: case MSR_MTRRfix16K_80000: case MSR_MTRRfix16K_A0000: @@ -41,7 +55,6 @@ static bool msr_mtrr_valid(unsigned msr) case MSR_MTRRfix4K_F0000: case MSR_MTRRfix4K_F8000: case MSR_MTRRdefType: - case MSR_IA32_CR_PAT: return true; } return false; @@ -52,7 +65,7 @@ static bool valid_mtrr_type(unsigned t) return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ } -bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) +static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) { int i; u64 mask; @@ -60,9 +73,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (!msr_mtrr_valid(msr)) return false; - if (msr == MSR_IA32_CR_PAT) { - return kvm_pat_valid(data); - } else if (msr == MSR_MTRRdefType) { + if (msr == MSR_MTRRdefType) { if (data & ~0xcff) return false; return valid_mtrr_type(data & 0xff); @@ -74,7 +85,8 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) } /* variable MTRRs */ - WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); + WARN_ON(!(msr >= MTRRphysBase_MSR(0) && + msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1))); mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu); if ((msr & 1) == 0) { @@ -88,7 +100,6 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) return (data & mask) == 0; } -EXPORT_SYMBOL_GPL(kvm_mtrr_valid); static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state) { @@ -308,10 +319,8 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; gfn_t start, end; - int index; - if (msr == MSR_IA32_CR_PAT || !tdp_enabled || - !kvm_arch_has_noncoherent_dma(vcpu->kvm)) + if (!tdp_enabled || !kvm_arch_has_noncoherent_dma(vcpu->kvm)) return; if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType) @@ -326,8 +335,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) end = ~0ULL; } else { /* variable range MTRRs. */ - index = (msr - 0x200) / 2; - var_mtrr_range(&mtrr_state->var_ranges[index], &start, &end); + var_mtrr_range(var_mtrr_msr_to_range(vcpu, msr), &start, &end); } kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end)); @@ -342,21 +350,18 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; struct kvm_mtrr_range *tmp, *cur; - int index, is_mtrr_mask; - index = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * index; - cur = &mtrr_state->var_ranges[index]; + cur = var_mtrr_msr_to_range(vcpu, msr); /* remove the entry if it's in the list. */ if (var_mtrr_range_is_valid(cur)) - list_del(&mtrr_state->var_ranges[index].node); + list_del(&cur->node); /* * Set all illegal GPA bits in the mask, since those bits must * implicitly be 0. The bits are then cleared when reading them. */ - if (!is_mtrr_mask) + if (is_mtrr_base_msr(msr)) cur->base = data; else cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu); @@ -382,8 +387,6 @@ int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data; else if (msr == MSR_MTRRdefType) vcpu->arch.mtrr_state.deftype = data; - else if (msr == MSR_IA32_CR_PAT) - vcpu->arch.pat = data; else set_var_mtrr_msr(vcpu, msr, data); @@ -411,21 +414,16 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 1; index = fixed_msr_to_range_index(msr); - if (index >= 0) + if (index >= 0) { *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index]; - else if (msr == MSR_MTRRdefType) + } else if (msr == MSR_MTRRdefType) { *pdata = vcpu->arch.mtrr_state.deftype; - else if (msr == MSR_IA32_CR_PAT) - *pdata = vcpu->arch.pat; - else { /* Variable MTRRs */ - int is_mtrr_mask; - - index = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * index; - if (!is_mtrr_mask) - *pdata = vcpu->arch.mtrr_state.var_ranges[index].base; + } else { + /* Variable MTRRs */ + if (is_mtrr_base_msr(msr)) + *pdata = var_mtrr_msr_to_range(vcpu, msr)->base; else - *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask; + *pdata = var_mtrr_msr_to_range(vcpu, msr)->mask; *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu); } diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 1690d41c1830..bf653df86112 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -93,11 +93,6 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef __KVM_X86_PMU_OP } -static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) -{ - return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc); -} - static void kvm_pmi_trigger_fn(struct irq_work *irq_work) { struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); @@ -562,6 +557,14 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)); + default: + break; + } return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) || static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr); } @@ -577,13 +580,86 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u32 msr = msr_info->index; + + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + msr_info->data = pmu->global_status; + break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + case MSR_CORE_PERF_GLOBAL_CTRL: + msr_info->data = pmu->global_ctrl; + break; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + msr_info->data = 0; + break; + default: + return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info); + } + + return 0; } int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); - return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u32 msr = msr_info->index; + u64 data = msr_info->data; + u64 diff; + + /* + * Note, AMD ignores writes to reserved bits and read-only PMU MSRs, + * whereas Intel generates #GP on attempts to write reserved/RO MSRs. + */ + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + if (!msr_info->host_initiated) + return 1; /* RO MSR */ + fallthrough; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + /* Per PPR, Read-only MSR. Writes are ignored. */ + if (!msr_info->host_initiated) + break; + + if (data & pmu->global_status_mask) + return 1; + + pmu->global_status = data; + break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + data &= ~pmu->global_ctrl_mask; + fallthrough; + case MSR_CORE_PERF_GLOBAL_CTRL: + if (!kvm_valid_perf_global_ctrl(pmu, data)) + return 1; + + if (pmu->global_ctrl != data) { + diff = pmu->global_ctrl ^ data; + pmu->global_ctrl = data; + reprogram_counters(pmu, diff); + } + break; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + /* + * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in + * GLOBAL_STATUS, and so the set of reserved bits is the same. + */ + if (data & pmu->global_status_mask) + return 1; + fallthrough; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + if (!msr_info->host_initiated) + pmu->global_status &= ~data; + break; + default: + kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); + return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info); + } + + return 0; } /* refresh PMU settings. This function generally is called when underlying diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 5c7bbf03b599..7d9ba301c090 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -20,7 +20,6 @@ struct kvm_pmu_ops { bool (*hw_event_available)(struct kvm_pmc *pmc); - bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask); @@ -37,10 +36,25 @@ struct kvm_pmu_ops { const u64 EVENTSEL_EVENT; const int MAX_NR_GP_COUNTERS; + const int MIN_NR_GP_COUNTERS; }; void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); +static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) +{ + /* + * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is + * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is + * greater than zero. However, KVM only exposes and emulates the MSR + * to/for the guest if the guest PMU supports at least "Architectural + * Performance Monitoring Version 2". + * + * AMD's version of PERF_GLOBAL_CTRL conveniently shows up with v2. + */ + return pmu->version > 1; +} + static inline u64 pmc_bitmask(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -161,6 +175,7 @@ extern struct x86_pmu_capability kvm_pmu_cap; static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) { bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; + int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; /* * Hybrid PMUs don't play nice with virtualization without careful @@ -175,11 +190,15 @@ static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) perf_get_x86_pmu_capability(&kvm_pmu_cap); /* - * For Intel, only support guest architectural pmu - * on a host with architectural pmu. + * WARN if perf did NOT disable hardware PMU if the number of + * architecturally required GP counters aren't present, i.e. if + * there are a non-zero number of counters, but fewer than what + * is architecturally required. */ - if ((is_intel && !kvm_pmu_cap.version) || - !kvm_pmu_cap.num_counters_gp) + if (!kvm_pmu_cap.num_counters_gp || + WARN_ON_ONCE(kvm_pmu_cap.num_counters_gp < min_nr_gp_ctrs)) + enable_pmu = false; + else if (is_intel && !kvm_pmu_cap.version) enable_pmu = false; } @@ -201,6 +220,33 @@ static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } +static inline void reprogram_counters(struct kvm_pmu *pmu, u64 diff) +{ + int bit; + + if (!diff) + return; + + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) + set_bit(bit, pmu->reprogram_pmi); + kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); +} + +/* + * Check if a PMC is enabled by comparing it against global_ctrl bits. + * + * If the vPMU doesn't have global_ctrl MSR, all vPMCs are enabled. + */ +static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (!kvm_pmu_has_perf_global_ctrl(pmu)) + return true; + + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); +} + void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index a5717282bb9c..56cbdb24400a 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -15,6 +15,7 @@ enum kvm_only_cpuid_leafs { CPUID_12_EAX = NCAPINTS, CPUID_7_1_EDX, CPUID_8000_0007_EDX, + CPUID_8000_0022_EAX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -47,6 +48,9 @@ enum kvm_only_cpuid_leafs { /* CPUID level 0x80000007 (EDX). */ #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) +/* CPUID level 0x80000022 (EAX) */ +#define KVM_X86_FEATURE_PERFMON_V2 KVM_X86_FEATURE(CPUID_8000_0022_EAX, 0) + struct cpuid_reg { u32 function; u32 index; @@ -74,6 +78,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_1_EDX] = { 7, 1, CPUID_EDX}, [CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX}, [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, + [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX}, }; /* @@ -108,6 +113,8 @@ static __always_inline u32 __feature_translate(int x86_feature) return KVM_X86_FEATURE_SGX_EDECCSSA; else if (x86_feature == X86_FEATURE_CONSTANT_TSC) return KVM_X86_FEATURE_CONSTANT_TSC; + else if (x86_feature == X86_FEATURE_PERFMON_V2) + return KVM_X86_FEATURE_PERFMON_V2; return x86_feature; } diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 5fa939e411d8..cef5a3d0abd0 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -78,14 +78,6 @@ static bool amd_hw_event_available(struct kvm_pmc *pmc) return true; } -/* check if a PMC is enabled by comparing it against global_ctrl bits. Because - * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE). - */ -static bool amd_pmc_is_enabled(struct kvm_pmc *pmc) -{ - return true; -} - static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -102,12 +94,6 @@ static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return amd_pmc_idx_to_pmc(vcpu_to_pmu(vcpu), idx & ~(3u << 30)); } -static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) -{ - /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */ - return false; -} - static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -119,6 +105,29 @@ static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) return pmc; } +static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + switch (msr) { + case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3: + return pmu->version > 0; + case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: + return guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE); + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + return pmu->version > 1; + default: + if (msr > MSR_F15H_PERF_CTR5 && + msr < MSR_F15H_PERF_CTL0 + 2 * pmu->nr_arch_gp_counters) + return pmu->version > 1; + break; + } + + return amd_msr_idx_to_pmc(vcpu, msr); +} + static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -172,20 +181,39 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void amd_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + union cpuid_0x80000022_ebx ebx; - if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + pmu->version = 1; + if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) { + pmu->version = 2; + /* + * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest + * CPUID entry is guaranteed to be non-NULL. + */ + BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 || + x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index); + ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx; + pmu->nr_arch_gp_counters = ebx.split.num_core_pmc; + } else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE; - else + } else { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; + } + + pmu->nr_arch_gp_counters = min_t(unsigned int, pmu->nr_arch_gp_counters, + kvm_pmu_cap.num_counters_gp); + + if (pmu->version > 1) { + pmu->global_ctrl_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1); + pmu->global_status_mask = pmu->global_ctrl_mask; + } pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; pmu->reserved_bits = 0xfffffff000280000ull; pmu->raw_event_mask = AMD64_RAW_EVENT_MASK; - pmu->version = 1; /* not applicable to AMD; but clean them to prevent any fall out */ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->nr_arch_fixed_counters = 0; - pmu->global_status = 0; bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); } @@ -216,11 +244,12 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) pmc_stop_counter(pmc); pmc->counter = pmc->prev_counter = pmc->eventsel = 0; } + + pmu->global_ctrl = pmu->global_status = 0; } struct kvm_pmu_ops amd_pmu_ops __initdata = { .hw_event_available = amd_hw_event_available, - .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = amd_msr_idx_to_pmc, @@ -233,4 +262,5 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .reset = amd_pmu_reset, .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, + .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, }; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3a29273d070e..d381ad424554 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -739,7 +739,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) BUG_ON(offset == MSR_INVALID); - return !!test_bit(bit_write, &tmp); + return test_bit(bit_write, &tmp); } static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, @@ -2926,9 +2926,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_CR_PAT: - if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) - return 1; - vcpu->arch.pat = data; + ret = kvm_set_msr_common(vcpu, msr); + if (ret) + break; + svm->vmcb01.ptr->save.g_pat = data; if (is_guest_mode(vcpu)) nested_vmcb02_compute_g_pat(svm); @@ -3487,7 +3488,7 @@ static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu) if (!is_vnmi_enabled(svm)) return false; - return !!(svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK); + return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK); } static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu) @@ -5001,9 +5002,22 @@ static __init void svm_set_cpu_caps(void) boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); - /* AMD PMU PERFCTR_CORE CPUID */ - if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) - kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); + if (enable_pmu) { + /* + * Enumerate support for PERFCTR_CORE if and only if KVM has + * access to enough counters to virtualize "core" support, + * otherwise limit vPMU support to the legacy number of counters. + */ + if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE) + kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS, + kvm_pmu_cap.num_counters_gp); + else + kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE); + + if (kvm_pmu_cap.version != 2 || + !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) + kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2); + } /* CPUID 0x8000001F (SME/SEV features) */ sev_set_cpu_caps(); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e35cf0bd0df9..ba2ed6d87364 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2649,7 +2649,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && - intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && + kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->guest_ia32_perf_global_ctrl))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; @@ -4524,7 +4524,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.pat = vmcs12->host_ia32_pat; } if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && - intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) + kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl)); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 741efe2c497b..30ec9ccdea47 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -73,18 +73,6 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) } } -static void reprogram_counters(struct kvm_pmu *pmu, u64 diff) -{ - int bit; - - if (!diff) - return; - - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - set_bit(bit, pmu->reprogram_pmi); - kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); -} - static bool intel_hw_event_available(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -107,17 +95,6 @@ static bool intel_hw_event_available(struct kvm_pmc *pmc) return true; } -/* check if a PMC is enabled by comparing it with globl_ctrl bits. */ -static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - if (!intel_pmu_has_perf_global_ctrl(pmu)) - return true; - - return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); -} - static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -198,11 +175,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: - case MSR_CORE_PERF_GLOBAL_STATUS: - case MSR_CORE_PERF_GLOBAL_CTRL: - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - return intel_pmu_has_perf_global_ctrl(pmu); - break; + return kvm_pmu_has_perf_global_ctrl(pmu); case MSR_IA32_PEBS_ENABLE: ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT; break; @@ -352,15 +325,6 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_FIXED_CTR_CTRL: msr_info->data = pmu->fixed_ctr_ctrl; break; - case MSR_CORE_PERF_GLOBAL_STATUS: - msr_info->data = pmu->global_status; - break; - case MSR_CORE_PERF_GLOBAL_CTRL: - msr_info->data = pmu->global_ctrl; - break; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - msr_info->data = 0; - break; case MSR_IA32_PEBS_ENABLE: msr_info->data = pmu->pebs_enable; break; @@ -410,29 +374,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (pmu->fixed_ctr_ctrl != data) reprogram_fixed_counters(pmu, data); break; - case MSR_CORE_PERF_GLOBAL_STATUS: - if (!msr_info->host_initiated) - return 1; /* RO MSR */ - - pmu->global_status = data; - break; - case MSR_CORE_PERF_GLOBAL_CTRL: - if (!kvm_valid_perf_global_ctrl(pmu, data)) - return 1; - - if (pmu->global_ctrl != data) { - diff = pmu->global_ctrl ^ data; - pmu->global_ctrl = data; - reprogram_counters(pmu, diff); - } - break; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (data & pmu->global_ovf_ctrl_mask) - return 1; - - if (!msr_info->host_initiated) - pmu->global_status &= ~data; - break; case MSR_IA32_PEBS_ENABLE: if (data & pmu->pebs_enable_mask) return 1; @@ -531,7 +472,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->reserved_bits = 0xffffffff00200000ull; pmu->raw_event_mask = X86_RAW_EVENT_MASK; pmu->global_ctrl_mask = ~0ull; - pmu->global_ovf_ctrl_mask = ~0ull; + pmu->global_status_mask = ~0ull; pmu->fixed_ctr_ctrl_mask = ~0ull; pmu->pebs_enable_mask = ~0ull; pmu->pebs_data_cfg_mask = ~0ull; @@ -585,11 +526,17 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED)); pmu->global_ctrl_mask = counter_mask; - pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask + + /* + * GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET) + * share reserved bit definitions. The kernel just happens to use + * OVF_CTRL for the names. + */ + pmu->global_status_mask = pmu->global_ctrl_mask & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); if (vmx_pt_mode_is_host_guest()) - pmu->global_ovf_ctrl_mask &= + pmu->global_status_mask &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); @@ -801,7 +748,7 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) pmc = intel_pmc_idx_to_pmc(pmu, bit); if (!pmc || !pmc_speculative_in_use(pmc) || - !intel_pmc_is_enabled(pmc) || !pmc->perf_event) + !pmc_is_globally_enabled(pmc) || !pmc->perf_event) continue; /* @@ -816,7 +763,6 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) struct kvm_pmu_ops intel_pmu_ops __initdata = { .hw_event_available = intel_hw_event_available, - .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = intel_msr_idx_to_pmc, @@ -831,4 +777,5 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .cleanup = intel_pmu_cleanup, .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC, + .MIN_NR_GP_COUNTERS = 1, }; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 44fb619803b8..2d9d155691a7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2287,19 +2287,16 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; goto find_uret_msr; case MSR_IA32_CR_PAT: - if (!kvm_pat_valid(data)) - return 1; + ret = kvm_set_msr_common(vcpu, msr_info); + if (ret) + break; if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) get_vmcs12(vcpu)->guest_ia32_pat = data; - if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, data); - vcpu->arch.pat = data; - break; - } - ret = kvm_set_msr_common(vcpu, msr_info); break; case MSR_IA32_MCG_EXT_CTL: if ((!msr_info->host_initiated && diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 9e66531861cf..32384ba38499 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -93,18 +93,6 @@ union vmx_exit_reason { u32 full; }; -static inline bool intel_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) -{ - /* - * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is - * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is - * greater than zero. However, KVM only exposes and emulates the MSR - * to/for the guest if the guest PMU supports at least "Architectural - * Performance Monitoring Version 2". - */ - return pmu->version > 1; -} - struct lbr_desc { /* Basic info about guest LBR records. */ struct x86_pmu_lbr records; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c0778ca39650..7d6e04450448 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1017,13 +1017,11 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS - if (static_cpu_has(X86_FEATURE_PKU) && + if (cpu_feature_enabled(X86_FEATURE_PKU) && vcpu->arch.pkru != vcpu->arch.host_pkru && ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) write_pkru(vcpu->arch.pkru); -#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ } EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); @@ -1032,15 +1030,13 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_state_protected) return; -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS - if (static_cpu_has(X86_FEATURE_PKU) && + if (cpu_feature_enabled(X86_FEATURE_PKU) && ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) { vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vcpu->arch.host_pkru) write_pkru(vcpu->arch.host_pkru); } -#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { @@ -1427,15 +1423,14 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); /* - * List of msr numbers which we expose to userspace through KVM_GET_MSRS - * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. - * - * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) - * extract the supported MSRs from the related const lists. - * msrs_to_save is selected from the msrs_to_save_all to reflect the - * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs - * may depend on host virtualization features rather than host cpu features. + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track + * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, + * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that + * require host support, i.e. should be probed via RDMSR. emulated_msrs holds + * MSRs that KVM emulates without strictly requiring host support. + * msr_based_features holds MSRs that enumerate features, i.e. are effectively + * CPUID leafs. Note, msr_based_features isn't mutually exclusive with + * msrs_to_save and emulated_msrs. */ static const u32 msrs_to_save_base[] = { @@ -1483,6 +1478,10 @@ static const u32 msrs_to_save_pmu[] = { MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, + + MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + @@ -1531,11 +1530,11 @@ static const u32 emulated_msrs_all[] = { MSR_IA32_UCODE_REV, /* - * The following list leaves out MSRs whose values are determined - * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. - * We always support the "true" VMX control MSRs, even if the host - * processor does not, so I am putting these registers here rather - * than in msrs_to_save_all. + * KVM always supports the "true" VMX control MSRs, even if the host + * does not. The VMX MSRs as a whole are considered "emulated" as KVM + * doesn't strictly require them to exist in the host (ignoring that + * KVM would refuse to load in the first place if the core set of MSRs + * aren't supported). */ MSR_IA32_VMX_BASIC, MSR_IA32_VMX_TRUE_PINBASED_CTLS, @@ -1631,7 +1630,7 @@ static u64 kvm_get_arch_capabilities(void) * If we're doing cache flushes (either "always" or "cond") * we will do one whenever the guest does a vmlaunch/vmresume. * If an outer hypervisor is doing the cache flush for us - * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that + * (ARCH_CAP_SKIP_VMENTRY_L1DFLUSH), we can safely pass that * capability to the guest too, and if EPT is disabled we're not * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will * require a nested hypervisor to do a flush of its own. @@ -1809,7 +1808,7 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) unsigned long *bitmap = ranges[i].bitmap; if ((index >= start) && (index < end) && (flags & type)) { - allowed = !!test_bit(index - start, bitmap); + allowed = test_bit(index - start, bitmap); break; } } @@ -3702,8 +3701,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } break; - case 0x200 ... MSR_IA32_MC0_CTL2 - 1: - case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: + case MSR_IA32_CR_PAT: + if (!kvm_pat_valid(data)) + return 1; + + vcpu->arch.pat = data; + break; + case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); @@ -4110,9 +4115,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset; break; } + case MSR_IA32_CR_PAT: + msr_info->data = vcpu->arch.pat; + break; case MSR_MTRRcap: - case 0x200 ... MSR_IA32_MC0_CTL2 - 1: - case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: + case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ msr_info->data = 3; @@ -7150,6 +7158,12 @@ static void kvm_probe_msr_to_save(u32 msr_index) kvm_pmu_cap.num_counters_fixed) return; break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) + return; + break; case MSR_IA32_XFD: case MSR_IA32_XFD_ERR: if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) @@ -10758,6 +10772,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; break; } + + /* Note, VM-Exits that go down the "slow" path are accounted below. */ + ++vcpu->stat.exits; } /* diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c544602d07a3..82e3dafc5453 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -309,7 +309,6 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu); u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); -bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 4fc5c2de2de4..01c5de4c279b 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -7,6 +7,8 @@ */ #include <linux/linkage.h> +#include <asm/cpufeatures.h> +#include <asm/alternative.h> #include <asm/asm.h> #include <asm/export.h> @@ -29,7 +31,7 @@ */ SYM_FUNC_START(rep_movs_alternative) cmpq $64,%rcx - jae .Lunrolled + jae .Llarge cmp $8,%ecx jae .Lword @@ -65,6 +67,12 @@ SYM_FUNC_START(rep_movs_alternative) _ASM_EXTABLE_UA( 2b, .Lcopy_user_tail) _ASM_EXTABLE_UA( 3b, .Lcopy_user_tail) +.Llarge: +0: ALTERNATIVE "jmp .Lunrolled", "rep movsb", X86_FEATURE_ERMS +1: RET + + _ASM_EXTABLE_UA( 0b, 1b) + .p2align 4 .Lunrolled: 10: movq (%rsi),%r8 diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 3cdac0f0055d..8192452d1d2d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -9,6 +9,7 @@ #include <linux/sched/task.h> #include <asm/set_memory.h> +#include <asm/cpu_device_id.h> #include <asm/e820/api.h> #include <asm/init.h> #include <asm/page.h> @@ -261,6 +262,24 @@ static void __init probe_page_size_mask(void) } } +#define INTEL_MATCH(_model) { .vendor = X86_VENDOR_INTEL, \ + .family = 6, \ + .model = _model, \ + } +/* + * INVLPG may not properly flush Global entries + * on these CPUs when PCIDs are enabled. + */ +static const struct x86_cpu_id invlpg_miss_ids[] = { + INTEL_MATCH(INTEL_FAM6_ALDERLAKE ), + INTEL_MATCH(INTEL_FAM6_ALDERLAKE_L ), + INTEL_MATCH(INTEL_FAM6_ALDERLAKE_N ), + INTEL_MATCH(INTEL_FAM6_RAPTORLAKE ), + INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_P), + INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_S), + {} +}; + static void setup_pcid(void) { if (!IS_ENABLED(CONFIG_X86_64)) @@ -269,6 +288,12 @@ static void setup_pcid(void) if (!boot_cpu_has(X86_FEATURE_PCID)) return; + if (x86_match_cpu(invlpg_miss_ids)) { + pr_info("Incomplete global flushes, disabling PCID"); + setup_clear_cpu_cap(X86_FEATURE_PCID); + return; + } + if (boot_cpu_has(X86_FEATURE_PGE)) { /* * This can't be cr4_set_bits_and_update_boot() -- the diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 557f0fe25dff..37db264866b6 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -172,10 +172,10 @@ void __meminit init_trampoline_kaslr(void) set_p4d(p4d_tramp, __p4d(_KERNPG_TABLE | __pa(pud_page_tramp))); - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); + trampoline_pgd_entry = + __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)); } else { - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); + trampoline_pgd_entry = + __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)); } } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 1056bbf55b17..438adb695daa 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2570,7 +2570,7 @@ out_image: } if (bpf_jit_enable > 1) - bpf_jit_dump(prog->len, proglen, pass + 1, image); + bpf_jit_dump(prog->len, proglen, pass + 1, rw_image); if (image) { if (!prog->is_func || extra_pass) { diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 8babce71915f..014c508e914d 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -198,7 +198,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) i++; } kfree(v); - return 0; + return msi_device_populate_sysfs(&dev->dev); error: if (ret == -ENOSYS) @@ -254,7 +254,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) dev_dbg(&dev->dev, "xen: msi --> pirq=%d --> irq=%d\n", pirq, irq); } - return 0; + return msi_device_populate_sysfs(&dev->dev); error: dev_err(&dev->dev, "Failed to create MSI%s! ret=%d!\n", @@ -346,7 +346,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (ret < 0) goto out; } - ret = 0; + ret = msi_device_populate_sysfs(&dev->dev); out: return ret; } @@ -394,6 +394,8 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev) xen_destroy_irq(msidesc->irq + i); msidesc->irq = 0; } + + msi_device_destroy_sysfs(&dev->dev); } static void xen_pv_teardown_msi_irqs(struct pci_dev *dev) diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 82fec66d46d2..42abd6af1198 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -14,6 +14,11 @@ $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE CFLAGS_sha256.o := -D__DISABLE_EXPORTS +# When profile-guided optimization is enabled, llvm emits two different +# overlapping text sections, which is not supported by kexec. Remove profile +# optimization flags. +KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%,$(KBUILD_CFLAGS)) + # When linking purgatory.ro with -r unresolved symbols are not checked, # also link a purgatory.chk binary without -r to check for unresolved symbols. PURGATORY_LDFLAGS := -e purgatory_start -z nodefaultlib |