diff options
Diffstat (limited to 'arch/x86/kvm/vmx/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 624 |
1 files changed, 328 insertions, 296 deletions
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7eec0226d56a..bcac3efcde41 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -12,6 +12,7 @@ * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/highmem.h> #include <linux/hrtimer.h> @@ -444,36 +445,36 @@ void vmread_error(unsigned long field, bool fault) if (fault) kvm_spurious_fault(); else - vmx_insn_failed("kvm: vmread failed: field=%lx\n", field); + vmx_insn_failed("vmread failed: field=%lx\n", field); } noinline void vmwrite_error(unsigned long field, unsigned long value) { - vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n", + vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n", field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n", + vmx_insn_failed("vmclear failed: %p/%llx err=%u\n", vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n", + vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n", vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) { - vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", + vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", ext, vpid, gva); } noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa) { - vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", + vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", ext, eptp, gpa); } @@ -488,8 +489,8 @@ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); -struct vmcs_config vmcs_config; -struct vmx_capability vmx_capability; +struct vmcs_config vmcs_config __ro_after_init; +struct vmx_capability vmx_capability __ro_after_init; #define VMX_SEGMENT_FIELD(seg) \ [VCPU_SREG_##seg] = { \ @@ -523,6 +524,8 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) static unsigned long host_idt_base; #if IS_ENABLED(CONFIG_HYPERV) +static struct kvm_x86_ops vmx_x86_ops __initdata; + static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); @@ -551,6 +554,71 @@ static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) return 0; } +static __init void hv_init_evmcs(void) +{ + int cpu; + + if (!enlightened_vmcs) + return; + + /* + * Enlightened VMCS usage should be recommended and the host needs + * to support eVMCS v1 or above. + */ + if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && + (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= + KVM_EVMCS_VERSION) { + + /* Check that we have assist pages on all online CPUs */ + for_each_online_cpu(cpu) { + if (!hv_get_vp_assist_page(cpu)) { + enlightened_vmcs = false; + break; + } + } + + if (enlightened_vmcs) { + pr_info("Using Hyper-V Enlightened VMCS\n"); + static_branch_enable(&enable_evmcs); + } + + if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) + vmx_x86_ops.enable_l2_tlb_flush + = hv_enable_l2_tlb_flush; + + } else { + enlightened_vmcs = false; + } +} + +static void hv_reset_evmcs(void) +{ + struct hv_vp_assist_page *vp_ap; + + if (!static_branch_unlikely(&enable_evmcs)) + return; + + /* + * KVM should enable eVMCS if and only if all CPUs have a VP assist + * page, and should reject CPU onlining if eVMCS is enabled the CPU + * doesn't have a VP assist page allocated. + */ + vp_ap = hv_get_vp_assist_page(smp_processor_id()); + if (WARN_ON_ONCE(!vp_ap)) + return; + + /* + * Reset everything to support using non-enlightened VMCS access later + * (e.g. when we reload the module with enlightened_vmcs=0) + */ + vp_ap->nested_control.features.directhypercall = 0; + vp_ap->current_nested_vmcs = 0; + vp_ap->enlighten_vmentry = 0; +} + +#else /* IS_ENABLED(CONFIG_HYPERV) */ +static void hv_init_evmcs(void) {} +static void hv_reset_evmcs(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ /* @@ -1613,8 +1681,8 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) if (!instr_len) goto rip_updated; - WARN(exit_reason.enclave_mode, - "KVM: skipping instruction after SGX enclave VM-Exit"); + WARN_ONCE(exit_reason.enclave_mode, + "skipping instruction after SGX enclave VM-Exit"); orig_rip = kvm_rip_read(vcpu); rip = orig_rip + instr_len; @@ -2138,9 +2206,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); + kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); } @@ -2448,88 +2514,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static __init int cpu_has_kvm_support(void) -{ - return cpu_has_vmx(); -} - -static __init int vmx_disabled_by_bios(void) -{ - return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || - !boot_cpu_has(X86_FEATURE_VMX); -} - -static int kvm_cpu_vmxon(u64 vmxon_pointer) -{ - u64 msr; - - cr4_set_bits(X86_CR4_VMXE); - - asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" - _ASM_EXTABLE(1b, %l[fault]) - : : [vmxon_pointer] "m"(vmxon_pointer) - : : fault); - return 0; - -fault: - WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", - rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); - cr4_clear_bits(X86_CR4_VMXE); - - return -EFAULT; -} - -static int vmx_hardware_enable(void) -{ - int cpu = raw_smp_processor_id(); - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - int r; - - if (cr4_read_shadow() & X86_CR4_VMXE) - return -EBUSY; - - /* - * This can happen if we hot-added a CPU but failed to allocate - * VP assist page for it. - */ - if (static_branch_unlikely(&enable_evmcs) && - !hv_get_vp_assist_page(cpu)) - return -EFAULT; - - intel_pt_handle_vmx(1); - - r = kvm_cpu_vmxon(phys_addr); - if (r) { - intel_pt_handle_vmx(0); - return r; - } - - if (enable_ept) - ept_sync_global(); - - return 0; -} - -static void vmclear_local_loaded_vmcss(void) -{ - int cpu = raw_smp_processor_id(); - struct loaded_vmcs *v, *n; - - list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) - __loaded_vmcs_clear(v); -} - -static void vmx_hardware_disable(void) -{ - vmclear_local_loaded_vmcss(); - - if (cpu_vmxoff()) - kvm_spurious_fault(); - - intel_pt_handle_vmx(0); -} - /* * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID * directly instead of going through cpu_has(), to ensure KVM is trapping @@ -2565,8 +2549,7 @@ static bool cpu_has_perf_global_ctrl_bug(void) return false; } -static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, - u32 msr, u32 *result) +static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { u32 vmx_msr_low, vmx_msr_high; u32 ctl = ctl_min | ctl_opt; @@ -2584,7 +2567,7 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } -static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) { u64 allowed; @@ -2593,8 +2576,8 @@ static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) return ctl_opt & allowed; } -static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, - struct vmx_capability *vmx_cap) +static int setup_vmcs_config(struct vmcs_config *vmcs_conf, + struct vmx_capability *vmx_cap) { u32 vmx_msr_low, vmx_msr_high; u32 _pin_based_exec_control = 0; @@ -2752,9 +2735,127 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmcs_conf->vmentry_ctrl = _vmentry_control; vmcs_conf->misc = misc_msr; +#if IS_ENABLED(CONFIG_HYPERV) + if (enlightened_vmcs) + evmcs_sanitize_exec_ctrls(vmcs_conf); +#endif + + return 0; +} + +static bool kvm_is_vmx_supported(void) +{ + int cpu = raw_smp_processor_id(); + + if (!cpu_has_vmx()) { + pr_err("VMX not supported by CPU %d\n", cpu); + return false; + } + + if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || + !this_cpu_has(X86_FEATURE_VMX)) { + pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); + return false; + } + + return true; +} + +static int vmx_check_processor_compat(void) +{ + int cpu = raw_smp_processor_id(); + struct vmcs_config vmcs_conf; + struct vmx_capability vmx_cap; + + if (!kvm_is_vmx_supported()) + return -EIO; + + if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { + pr_err("Failed to setup VMCS config on CPU %d\n", cpu); + return -EIO; + } + if (nested) + nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); + if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { + pr_err("Inconsistent VMCS config on CPU %d\n", cpu); + return -EIO; + } + return 0; +} + +static int kvm_cpu_vmxon(u64 vmxon_pointer) +{ + u64 msr; + + cr4_set_bits(X86_CR4_VMXE); + + asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" + _ASM_EXTABLE(1b, %l[fault]) + : : [vmxon_pointer] "m"(vmxon_pointer) + : : fault); + return 0; + +fault: + WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", + rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); + cr4_clear_bits(X86_CR4_VMXE); + + return -EFAULT; +} + +static int vmx_hardware_enable(void) +{ + int cpu = raw_smp_processor_id(); + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + int r; + + if (cr4_read_shadow() & X86_CR4_VMXE) + return -EBUSY; + + /* + * This can happen if we hot-added a CPU but failed to allocate + * VP assist page for it. + */ + if (static_branch_unlikely(&enable_evmcs) && + !hv_get_vp_assist_page(cpu)) + return -EFAULT; + + intel_pt_handle_vmx(1); + + r = kvm_cpu_vmxon(phys_addr); + if (r) { + intel_pt_handle_vmx(0); + return r; + } + + if (enable_ept) + ept_sync_global(); + return 0; } +static void vmclear_local_loaded_vmcss(void) +{ + int cpu = raw_smp_processor_id(); + struct loaded_vmcs *v, *n; + + list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + __loaded_vmcs_clear(v); +} + +static void vmx_hardware_disable(void) +{ + vmclear_local_loaded_vmcss(); + + if (cpu_vmxoff()) + kvm_spurious_fault(); + + hv_reset_evmcs(); + + intel_pt_handle_vmx(0); +} + struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) { int node = cpu_to_node(cpu); @@ -2950,9 +3051,8 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save) var.type = 0x3; var.avl = 0; if (save->base & 0xf) - printk_once(KERN_WARNING "kvm: segment base is not " - "paragraph aligned when entering " - "protected mode (seg=%d)", seg); + pr_warn_once("segment base is not paragraph aligned " + "when entering protected mode (seg=%d)", seg); } vmcs_write16(sf->selector, var.selector); @@ -2982,8 +3082,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) * vcpu. Warn the user that an update is overdue. */ if (!kvm_vmx->tss_addr) - printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " - "called before entering vcpu\n"); + pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n"); vmx_segment_cache_clear(vmx); @@ -3800,39 +3899,6 @@ static void seg_setup(int seg) vmcs_write32(sf->ar_bytes, ar); } -static int alloc_apic_access_page(struct kvm *kvm) -{ - struct page *page; - void __user *hva; - int ret = 0; - - mutex_lock(&kvm->slots_lock); - if (kvm->arch.apic_access_memslot_enabled) - goto out; - hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, - APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); - if (IS_ERR(hva)) { - ret = PTR_ERR(hva); - goto out; - } - - page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) { - ret = -EFAULT; - goto out; - } - - /* - * Do not pin the page in memory, so that memory hot-unplug - * is able to migrate it. - */ - put_page(page); - kvm->arch.apic_access_memslot_enabled = true; -out: - mutex_unlock(&kvm->slots_lock); - return ret; -} - int allocate_vpid(void) { int vpid; @@ -3865,8 +3931,13 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR * bitmap has changed. */ - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); + if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) { + struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; + + if (evmcs->hv_enlightenments_control.msr_bitmap) + evmcs->hv_clean_fields &= + ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; + } vmx->nested.force_msr_bitmap_recalc = true; } @@ -3947,29 +4018,20 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) vmx_set_msr_bitmap_write(msr_bitmap, msr); } -static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) -{ - unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; - unsigned long read_intercept; - int msr; - - read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; - - for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { - unsigned int read_idx = msr / BITS_PER_LONG; - unsigned int write_idx = read_idx + (0x800 / sizeof(long)); - - msr_bitmap[read_idx] = read_intercept; - msr_bitmap[write_idx] = ~0ul; - } -} - static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) { + /* + * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves + * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0, + * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits. + */ + const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG; + const int write_idx = read_idx + (0x800 / sizeof(u64)); struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap; u8 mode; - if (!cpu_has_vmx_msr_bitmap()) + if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu))) return; if (cpu_has_secondary_exec_ctrls() && @@ -3987,7 +4049,18 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) vmx->x2apic_msr_bitmap_mode = mode; - vmx_reset_x2apic_msrs(vcpu, mode); + /* + * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended + * registers (0x840 and above) intercepted, KVM doesn't support them. + * Intercept all writes by default and poke holes as needed. Pass + * through reads for all valid registers by default in x2APIC+APICv + * mode, only the current timer count needs on-demand emulation by KVM. + */ + if (mode & MSR_BITMAP_MODE_X2APIC_APICV) + msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic); + else + msr_bitmap[read_idx] = ~0ull; + msr_bitmap[write_idx] = ~0ull; /* * TPR reads and writes can be virtualized even if virtual interrupt @@ -4519,6 +4592,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + /* + * KVM doesn't support VMFUNC for L1, but the control is set in KVM's + * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2. + */ + exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC; + /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, * in vmx_set_cr4. */ exec_control &= ~SECONDARY_EXEC_DESC; @@ -4535,7 +4614,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) * it needs to be set here when dirty logging is already active, e.g. * if this vCPU was created after dirty logging was enabled. */ - if (!vcpu->kvm->arch.cpu_dirty_logging_count) + if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; if (cpu_has_vmx_xsaves()) { @@ -5099,8 +5178,13 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) vect_info = vmx->idt_vectoring_info; intr_info = vmx_get_intr_info(vcpu); + /* + * Machine checks are handled by handle_exception_irqoff(), or by + * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by + * vmx_vcpu_enter_exit(). + */ if (is_machine_check(intr_info) || is_nmi(intr_info)) - return 1; /* handled by handle_exception_nmi_irqoff() */ + return 1; /* * Queue the exception here instead of in handle_nm_fault_irqoff(). @@ -6790,17 +6874,8 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); } -void vmx_do_interrupt_nmi_irqoff(unsigned long entry); - -static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, - unsigned long entry) -{ - bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist; - - kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ); - vmx_do_interrupt_nmi_irqoff(entry); - kvm_after_interrupt(vcpu); -} +void vmx_do_interrupt_irqoff(unsigned long entry); +void vmx_do_nmi_irqoff(void); static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) { @@ -6822,9 +6897,8 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); } -static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) +static void handle_exception_irqoff(struct vcpu_vmx *vmx) { - const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist; u32 intr_info = vmx_get_intr_info(&vmx->vcpu); /* if exit due to PF check for async PF */ @@ -6836,9 +6910,6 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) /* Handle machine checks before interrupts are enabled */ else if (is_machine_check(intr_info)) kvm_machine_check(); - /* We need to handle NMIs before interrupts are enabled */ - else if (is_nmi(intr_info)) - handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry); } static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) @@ -6848,10 +6919,13 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) gate_desc *desc = (gate_desc *)host_idt_base + vector; if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, - "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) + "unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; - handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc)); + kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); + vmx_do_interrupt_irqoff(gate_offset(desc)); + kvm_after_interrupt(vcpu); + vcpu->arch.at_instruction_boundary = true; } @@ -6865,7 +6939,7 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu); else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) - handle_exception_nmi_irqoff(vmx); + handle_exception_irqoff(vmx); } /* @@ -7100,9 +7174,10 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) } static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, - struct vcpu_vmx *vmx, - unsigned long flags) + unsigned int flags) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + guest_state_enter_irqoff(); /* L1D Flush includes CPU buffer clear to mitigate MDS */ @@ -7126,6 +7201,18 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vmx_enable_fb_clear(vmx); + if (unlikely(vmx->fail)) + vmx->exit_reason.full = 0xdead; + else + vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); + + if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && + is_nmi(vmx_get_intr_info(vcpu))) { + kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + vmx_do_nmi_irqoff(); + kvm_after_interrupt(vcpu); + } + guest_state_exit_irqoff(); } @@ -7220,7 +7307,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_wait_lapic_expire(vcpu); /* The actual VMENTER/EXIT is in the .noinstr.text section. */ - vmx_vcpu_enter_exit(vcpu, vmx, __vmx_vcpu_run_flags(vmx)); + vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx)); /* All fields are clean at this point */ if (static_branch_unlikely(&enable_evmcs)) { @@ -7267,12 +7354,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->idt_vectoring_info = 0; - if (unlikely(vmx->fail)) { - vmx->exit_reason.full = 0xdead; + if (unlikely(vmx->fail)) return EXIT_FASTPATH_NONE; - } - vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); @@ -7386,7 +7470,7 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) vmx->loaded_vmcs = &vmx->vmcs01; if (cpu_need_virtualize_apic_accesses(vcpu)) { - err = alloc_apic_access_page(vcpu->kvm); + err = kvm_alloc_apic_access_page(vcpu->kvm); if (err) goto free_vmcs; } @@ -7446,29 +7530,6 @@ static int vmx_vm_init(struct kvm *kvm) return 0; } -static int __init vmx_check_processor_compat(void) -{ - struct vmcs_config vmcs_conf; - struct vmx_capability vmx_cap; - - if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || - !this_cpu_has(X86_FEATURE_VMX)) { - pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id()); - return -EIO; - } - - if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) - return -EIO; - if (nested) - nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); - if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { - printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", - smp_processor_id()); - return -EIO; - } - return 0; -} - static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u8 cache; @@ -7940,17 +8001,20 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (WARN_ON_ONCE(!enable_pml)) + return; + if (is_guest_mode(vcpu)) { vmx->nested.update_vmcs01_cpu_dirty_logging = true; return; } /* - * Note, cpu_dirty_logging_count can be changed concurrent with this + * Note, nr_memslots_dirty_logging can be changed concurrent with this * code, but in that case another update request will be made and so * the guest will never run with a stale PML value. */ - if (vcpu->kvm->arch.cpu_dirty_logging_count) + if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); else secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); @@ -8048,17 +8112,16 @@ static void vmx_hardware_unsetup(void) free_kvm_area(); } -static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) -{ - ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | - BIT(APICV_INHIBIT_REASON_ABSENT) | - BIT(APICV_INHIBIT_REASON_HYPERV) | - BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | - BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | - BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); - - return supported & BIT(reason); -} +#define VMX_REQUIRED_APICV_INHIBITS \ +( \ + BIT(APICV_INHIBIT_REASON_DISABLE)| \ + BIT(APICV_INHIBIT_REASON_ABSENT) | \ + BIT(APICV_INHIBIT_REASON_HYPERV) | \ + BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \ + BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ + BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ + BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \ +) static void vmx_vm_destroy(struct kvm *kvm) { @@ -8068,7 +8131,9 @@ static void vmx_vm_destroy(struct kvm *kvm) } static struct kvm_x86_ops vmx_x86_ops __initdata = { - .name = "kvm_intel", + .name = KBUILD_MODNAME, + + .check_processor_compatibility = vmx_check_processor_compat, .hardware_unsetup = vmx_hardware_unsetup, @@ -8142,7 +8207,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, .load_eoi_exitmap = vmx_load_eoi_exitmap, .apicv_post_state_restore = vmx_apicv_post_state_restore, - .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons, + .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, @@ -8288,7 +8353,7 @@ static __init int hardware_setup(void) return -EIO; if (cpu_has_perf_global_ctrl_bug()) - pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " + pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " "does not work properly. Using workaround\n"); if (boot_cpu_has(X86_FEATURE_NX)) @@ -8296,7 +8361,7 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_MPX)) { rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); - WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); + WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); } if (!cpu_has_vmx_mpx()) @@ -8315,7 +8380,7 @@ static __init int hardware_setup(void) /* NX support is required for shadow paging. */ if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { - pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n"); + pr_err_ratelimited("NX (Execute Disable) not supported\n"); return -EOPNOTSUPP; } @@ -8467,9 +8532,6 @@ static __init int hardware_setup(void) } static struct kvm_x86_init_ops vmx_init_ops __initdata = { - .cpu_has_kvm_support = cpu_has_kvm_support, - .disabled_by_bios = vmx_disabled_by_bios, - .check_processor_compatibility = vmx_check_processor_compat, .hardware_setup = hardware_setup, .handle_intel_pt_intr = NULL, @@ -8487,41 +8549,23 @@ static void vmx_cleanup_l1d_flush(void) l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; } -static void vmx_exit(void) +static void __vmx_exit(void) { + allow_smaller_maxphyaddr = false; + #ifdef CONFIG_KEXEC_CORE RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); synchronize_rcu(); #endif + vmx_cleanup_l1d_flush(); +} +static void vmx_exit(void) +{ kvm_exit(); + kvm_x86_vendor_exit(); -#if IS_ENABLED(CONFIG_HYPERV) - if (static_branch_unlikely(&enable_evmcs)) { - int cpu; - struct hv_vp_assist_page *vp_ap; - /* - * Reset everything to support using non-enlightened VMCS - * access later (e.g. when we reload the module with - * enlightened_vmcs=0) - */ - for_each_online_cpu(cpu) { - vp_ap = hv_get_vp_assist_page(cpu); - - if (!vp_ap) - continue; - - vp_ap->nested_control.features.directhypercall = 0; - vp_ap->current_nested_vmcs = 0; - vp_ap->enlighten_vmentry = 0; - } - - static_branch_disable(&enable_evmcs); - } -#endif - vmx_cleanup_l1d_flush(); - - allow_smaller_maxphyaddr = false; + __vmx_exit(); } module_exit(vmx_exit); @@ -8529,56 +8573,29 @@ static int __init vmx_init(void) { int r, cpu; -#if IS_ENABLED(CONFIG_HYPERV) + if (!kvm_is_vmx_supported()) + return -EOPNOTSUPP; + /* - * Enlightened VMCS usage should be recommended and the host needs - * to support eVMCS v1 or above. We can also disable eVMCS support - * with module parameter. + * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing + * to unwind if a later step fails. */ - if (enlightened_vmcs && - ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && - (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= - KVM_EVMCS_VERSION) { - - /* Check that we have assist pages on all online CPUs */ - for_each_online_cpu(cpu) { - if (!hv_get_vp_assist_page(cpu)) { - enlightened_vmcs = false; - break; - } - } + hv_init_evmcs(); - if (enlightened_vmcs) { - pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); - static_branch_enable(&enable_evmcs); - } - - if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) - vmx_x86_ops.enable_l2_tlb_flush - = hv_enable_l2_tlb_flush; - - } else { - enlightened_vmcs = false; - } -#endif - - r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + r = kvm_x86_vendor_init(&vmx_init_ops); if (r) return r; /* - * Must be called after kvm_init() so enable_ept is properly set + * Must be called after common x86 init so enable_ept is properly set * up. Hand the parameter mitigation value in which was stored in * the pre module init parser. If no parameter was given, it will * contain 'auto' which will be turned into the default 'cond' * mitigation mode. */ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); - if (r) { - vmx_exit(); - return r; - } + if (r) + goto err_l1d_flush; vmx_setup_fb_clear_ctrl(); @@ -8602,6 +8619,21 @@ static int __init vmx_init(void) if (!enable_ept) allow_smaller_maxphyaddr = true; + /* + * Common KVM initialization _must_ come last, after this, /dev/kvm is + * exposed to userspace! + */ + r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), + THIS_MODULE); + if (r) + goto err_kvm_init; + return 0; + +err_kvm_init: + __vmx_exit(); +err_l1d_flush: + kvm_x86_vendor_exit(); + return r; } module_init(vmx_init); |