From 8e39efd840b8d4eae5ab398b43e20ffaff0010cc Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 10 May 2022 15:40:34 -0700 Subject: KVM: VMX: Print VM-instruction error when it may be helpful Include the value of the "VM-instruction error" field from the current VMCS (if any) in the error message for VMCLEAR and VMPTRLD, since each of these instructions may result in more than one VM-instruction error. Previously, this field was only reported for VMWRITE errors. Signed-off-by: David Matlack [Rebased and refactored code; dropped the error number for INVVPID and INVEPT; reworded commit message.] Signed-off-by: Jim Mattson Message-Id: <20220510224035.1792952-1-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f5aeade623d6..673ba5ca0beb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -392,12 +392,14 @@ noinline void vmwrite_error(unsigned long field, unsigned long value) noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr); + vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n", + vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr); + vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n", + vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) -- cgit v1.2.3 From cc07e60b0811eeeca769fb342aa6e13da5977657 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 10 May 2022 15:40:35 -0700 Subject: KVM: VMX: Print VM-instruction error as unsigned Change the printf format character from 'd' to 'u' for the VM-instruction error in vmwrite_error(). Fixes: 6aa8b732ca01 ("[PATCH] kvm: userspace interface") Reported-by: Sean Christopherson Signed-off-by: Jim Mattson Message-Id: <20220510224035.1792952-2-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 673ba5ca0beb..e0d3bea73b28 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -386,7 +386,7 @@ asmlinkage void vmread_error(unsigned long field, bool fault) noinline void vmwrite_error(unsigned long field, unsigned long value) { - vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n", + vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n", field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); } -- cgit v1.2.3 From 0471a7bd1bca2a47a5f378f2222c5cf39ce94152 Mon Sep 17 00:00:00 2001 From: Lev Kujawski Date: Sat, 21 May 2022 08:15:11 +0000 Subject: KVM: set_msr_mce: Permit guests to ignore single-bit ECC errors Certain guest operating systems (e.g., UNIXWARE) clear bit 0 of MC1_CTL to ignore single-bit ECC data errors. Single-bit ECC data errors are always correctable and thus are safe to ignore because they are informational in nature rather than signaling a loss of data integrity. 
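For illustration, the write-validity rule the patch below adopts can be modeled as a tiny standalone C program; the function name and test values here are hypothetical, and the real check lives in set_msr_mce() in arch/x86/kvm/x86.c:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * Hypothetical standalone model of the IA32_MCi_CTL write check: accept 0,
 * all 1s, or all 1s with bit 10 cleared (AMD K8 GART erratum workaround)
 * and/or bit 0 cleared (single-bit ECC reporting, as UNIXWARE does).
 */
static bool mci_ctl_write_ok(uint64_t data)
{
	return data == 0 || (data | (1ull << 10) | 1) == ~(uint64_t)0;
}

int main(void)
{
	assert(mci_ctl_write_ok(0));                /* all banks off */
	assert(mci_ctl_write_ok(~(uint64_t)0));     /* all banks on */
	assert(mci_ctl_write_ok(~(uint64_t)0 ^ 1)); /* bit 0 clear: now OK */
	assert(!mci_ctl_write_ok(0x1234));          /* arbitrary mask: rejected */
	return 0;
}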
Prior to this patch, these guests would crash upon writing MC1_CTL, with resultant error messages like the following: error: kvm run failed Operation not permitted EAX=fffffffe EBX=fffffffe ECX=00000404 EDX=ffffffff ESI=ffffffff EDI=00000001 EBP=fffdaba4 ESP=fffdab20 EIP=c01333a5 EFL=00000246 [---Z-P-] CPL=0 II=0 A20=1 SMM=0 HLT=0 ES =0108 00000000 ffffffff 00c09300 DPL=0 DS [-WA] CS =0100 00000000 ffffffff 00c09b00 DPL=0 CS32 [-RA] SS =0108 00000000 ffffffff 00c09300 DPL=0 DS [-WA] DS =0108 00000000 ffffffff 00c09300 DPL=0 DS [-WA] FS =0000 00000000 ffffffff 00c00000 GS =0000 00000000 ffffffff 00c00000 LDT=0118 c1026390 00000047 00008200 DPL=0 LDT TR =0110 ffff5af0 00000067 00008b00 DPL=0 TSS32-busy GDT= ffff5020 000002cf IDT= ffff52f0 000007ff CR0=8001003b CR2=00000000 CR3=0100a000 CR4=00000230 DR0=00000000 DR1=00000000 DR2=00000000 DR3=00000000 DR6=ffff0ff0 DR7=00000400 EFER=0000000000000000 Code=08 89 01 89 51 04 c3 8b 4c 24 08 8b 01 8b 51 04 8b 4c 24 04 <0f> 30 c3 f7 05 a4 6d ff ff 10 00 00 00 74 03 0f 31 c3 33 c0 33 d2 c3 8d 74 26 00 0f 31 c3 Signed-off-by: Lev Kujawski Message-Id: <20220521081511.187388-1-lkujaw@member.fsf.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b81ef4f497f4..b5aeed18b9f5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3234,10 +3234,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* only 0 or all 1s can be written to IA32_MCi_CTL * some Linux kernels though clear bit 10 in bank 4 to * workaround a BIOS/GART TBL issue on AMD K8s, ignore - * this to avoid an uncatched #GP in the guest + * this to avoid an uncatched #GP in the guest. + * + * UNIXWARE clears bit 0 of MC1_CTL to ignore + * correctable, single-bit ECC data errors. */ if ((offset & 0x3) == 0 && - data != 0 && (data | (1 << 10)) != ~(u64)0) + data != 0 && (data | (1 << 10) | 1) != ~(u64)0) return -1; /* MCi_STATUS */ -- cgit v1.2.3 From 00f08d99dd7d3ab3d8cb1fa356e857fcccbbdde8 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Mon, 2 May 2022 00:07:25 +0200 Subject: KVM: nSVM: Sync next_rip field from vmcb12 to vmcb02 The next_rip field of a VMCB is *not* an output-only field for a VMRUN. This field value (instead of the saved guest RIP) is used by the CPU for the return address pushed on the stack when injecting a software interrupt or INT3 or INTO exception. Make sure this field gets synced from vmcb12 to vmcb02 when entering L2 or loading a nested state and NRIPS is exposed to L1. If NRIPS is supported in hardware but not exposed to L1 (nrips=0 or hidden by userspace), stuff vmcb02's next_rip from the new L2 RIP to emulate a !NRIPS CPU (which saves RIP on the stack as-is). Reviewed-by: Maxim Levitsky Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Maciej S.
Szmigiero Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 22 +++++++++++++++++++--- arch/x86/kvm/svm/svm.h | 1 + 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 3361258640a2..e8aa95a74564 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -371,6 +371,7 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, to->nested_ctl = from->nested_ctl; to->event_inj = from->event_inj; to->event_inj_err = from->event_inj_err; + to->next_rip = from->next_rip; to->nested_cr3 = from->nested_cr3; to->virt_ext = from->virt_ext; to->pause_filter_count = from->pause_filter_count; @@ -608,7 +609,8 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 } } -static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) +static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, + unsigned long vmcb12_rip) { u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK; u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; @@ -662,6 +664,19 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vmcb02->control.event_inj = svm->nested.ctl.event_inj; vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err; + /* + * next_rip is consumed on VMRUN as the return address pushed on the + * stack for injected soft exceptions/interrupts. If nrips is exposed + * to L1, take it verbatim from vmcb12. If nrips is supported in + * hardware but not exposed to L1, stuff the actual L2 RIP to emulate + * what a nrips=0 CPU would do (L1 is responsible for advancing RIP + * prior to injecting the event). + */ + if (svm->nrips_enabled) + vmcb02->control.next_rip = svm->nested.ctl.next_rip; + else if (boot_cpu_has(X86_FEATURE_NRIPS)) + vmcb02->control.next_rip = vmcb12_rip; + vmcb02->control.virt_ext = vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK; if (svm->lbrv_enabled) @@ -745,7 +760,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_control(svm, vmcb12->save.rip); nested_vmcb02_prepare_save(svm, vmcb12); ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3, @@ -1418,6 +1433,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->nested_ctl = from->nested_ctl; dst->event_inj = from->event_inj; dst->event_inj_err = from->event_inj_err; + dst->next_rip = from->next_rip; dst->nested_cr3 = from->nested_cr3; dst->virt_ext = from->virt_ext; dst->pause_filter_count = from->pause_filter_count; @@ -1602,7 +1618,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, nested_copy_vmcb_control_to_cache(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip); /* * While the nested guest CR3 is already checked and set by diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 500348c1cb35..de076d658390 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -139,6 +139,7 @@ struct vmcb_ctrl_area_cached { u64 nested_ctl; u32 event_inj; u32 event_inj_err; + u64 next_rip; u64 nested_cr3; u64 virt_ext; u32 clean; -- cgit v1.2.3 From f17c31c48e5cde9895a491d91c424eeeada3e134 Mon Sep 17 00:00:00 2001 From: "Maciej S. 
Szmigiero" Date: Mon, 2 May 2022 00:07:26 +0200 Subject: KVM: SVM: Don't BUG if userspace injects an interrupt with GIF=0 Don't BUG/WARN on interrupt injection due to GIF being cleared, since it's trivial for userspace to force the situation via KVM_SET_VCPU_EVENTS (even if having at least a WARN there would be correct for KVM internally generated injections). kernel BUG at arch/x86/kvm/svm/svm.c:3386! invalid opcode: 0000 [#1] SMP CPU: 15 PID: 926 Comm: smm_test Not tainted 5.17.0-rc3+ #264 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:svm_inject_irq+0xab/0xb0 [kvm_amd] Code: <0f> 0b 0f 1f 00 0f 1f 44 00 00 80 3d ac b3 01 00 00 55 48 89 f5 53 RSP: 0018:ffffc90000b37d88 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff88810a234ac0 RCX: 0000000000000006 RDX: 0000000000000000 RSI: ffffc90000b37df7 RDI: ffff88810a234ac0 RBP: ffffc90000b37df7 R08: ffff88810a1fa410 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffff888109571000 R14: ffff88810a234ac0 R15: 0000000000000000 FS: 0000000001821380(0000) GS:ffff88846fdc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f74fc550008 CR3: 000000010a6fe000 CR4: 0000000000350ea0 Call Trace: inject_pending_event+0x2f7/0x4c0 [kvm] kvm_arch_vcpu_ioctl_run+0x791/0x17a0 [kvm] kvm_vcpu_ioctl+0x26d/0x650 [kvm] __x64_sys_ioctl+0x82/0xb0 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: 219b65dcf6c0 ("KVM: SVM: Improve nested interrupt injection") Cc: stable@vger.kernel.org Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Maciej S. Szmigiero Message-Id: <35426af6e123cbe91ec7ce5132ce72521f02b1b5.1651440202.git.maciej.szmigiero@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2628452a5156..a191ec138636 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3392,8 +3392,6 @@ static void svm_inject_irq(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - BUG_ON(!(gif_set(svm))); - trace_kvm_inj_virq(vcpu->arch.interrupt.nr); ++vcpu->stat.irq_injections; -- cgit v1.2.3 From cd9e6da8048c5b40315ee2d929b6230ce1252c3c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 May 2022 00:07:27 +0200 Subject: KVM: SVM: Unwind "speculative" RIP advancement if INTn injection "fails" Unwind the RIP advancement done by svm_queue_exception() when injecting an INT3 ultimately "fails" due to the CPU encountering a VM-Exit while vectoring the injected event, even if the exception reported by the CPU isn't the same event that was injected. If vectoring INT3 encounters an exception, e.g. #NP, and vectoring the #NP encounters an intercepted exception, e.g. #PF when KVM is using shadow paging, then the #NP will be reported as the event that was in-progress. Note, this is still imperfect, as it will get a false positive if the INT3 is cleanly injected, no VM-Exit occurs before the IRET from the INT3 handler in the guest, the instruction following the INT3 generates an exception (directly or indirectly), _and_ vectoring that exception encounters an exception that is intercepted by KVM. The false positives could theoretically be solved by further analyzing the vectoring event, e.g. 
by comparing the error code against the expected error code were an exception to occur when vectoring the original injected exception, but SVM without NRIPS is a complete disaster; trying to make it 100% correct is a waste of time. Reviewed-by: Maxim Levitsky Fixes: 66b7138f9136 ("KVM: SVM: Emulate nRIP feature when reinjecting INT3") Signed-off-by: Sean Christopherson Signed-off-by: Maciej S. Szmigiero Message-Id: <450133cf0a026cb9825a2ff55d02cb136a1cb111.1651440202.git.maciej.szmigiero@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a191ec138636..c1506069e67c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3706,6 +3706,18 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; + /* + * If NextRIP isn't enabled, KVM must manually advance RIP prior to + * injecting the soft exception/interrupt. That advancement needs to + * be unwound if vectoring didn't complete. Note, the _new_ event may + * not be the injected event, e.g. if KVM injected an INTn, the INTn + * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will + * be the reported vectored event, but RIP still needs to be unwound. + */ + if (int3_injected && type == SVM_EXITINTINFO_TYPE_EXEPT && + kvm_is_linear_rip(vcpu, svm->int3_rip)) + kvm_rip_write(vcpu, kvm_rip_read(vcpu) - int3_injected); + switch (type) { case SVM_EXITINTINFO_TYPE_NMI: vcpu->arch.nmi_injected = true; @@ -3719,16 +3731,11 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) /* * In case of software exceptions, do not reinject the vector, - * but re-execute the instruction instead. Rewind RIP first - * if we emulated INT3 before. + * but re-execute the instruction instead. */ - if (kvm_exception_is_soft(vector)) { - if (vector == BP_VECTOR && int3_injected && - kvm_is_linear_rip(vcpu, svm->int3_rip)) - kvm_rip_write(vcpu, - kvm_rip_read(vcpu) - int3_injected); + if (kvm_exception_is_soft(vector)) break; - } + if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { u32 err = svm->vmcb->control.exit_int_info_err; kvm_requeue_exception_e(vcpu, vector, err); -- cgit v1.2.3 From 3741aec4c38fa4123ab08ae552f05366d4fd05d8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 May 2022 00:07:28 +0200 Subject: KVM: SVM: Stuff next_rip on emulated INT3 injection if NRIPS is supported If NRIPS is supported in hardware but disabled in KVM, set next_rip to the next RIP when advancing RIP as part of emulating INT3 injection. There is no flag to tell the CPU that KVM isn't using next_rip, and so leaving next_rip as-is will result in the CPU pushing garbage onto the stack when vectoring the injected event. Reviewed-by: Maxim Levitsky Fixes: 66b7138f9136 ("KVM: SVM: Emulate nRIP feature when reinjecting INT3") Signed-off-by: Sean Christopherson Signed-off-by: Maciej S.
Szmigiero Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c1506069e67c..8c3c6bba6ccd 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -392,6 +392,10 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) */ (void)svm_skip_emulated_instruction(vcpu); rip = kvm_rip_read(vcpu); + + if (boot_cpu_has(X86_FEATURE_NRIPS)) + svm->vmcb->control.next_rip = rip; + svm->int3_rip = rip + svm->vmcb->save.cs.base; svm->int3_injected = rip - old_rip; } @@ -3709,7 +3713,7 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) /* * If NextRIP isn't enabled, KVM must manually advance RIP prior to * injecting the soft exception/interrupt. That advancement needs to - * be unwound if vectoring didn't complete. Note, the _new_ event may + * be unwound if vectoring didn't complete. Note, the new event may * not be the injected event, e.g. if KVM injected an INTn, the INTn * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will * be the reported vectored event, but RIP still needs to be unwound. -- cgit v1.2.3 From 6ef88d6e36c2b4b3886ec9967cafabe4424d27d5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 May 2022 00:07:29 +0200 Subject: KVM: SVM: Re-inject INT3/INTO instead of retrying the instruction Re-inject INT3/INTO instead of retrying the instruction if the CPU encountered an intercepted exception while vectoring the software exception, e.g. if vectoring INT3 encounters a #PF and KVM is using shadow paging. Retrying the instruction is architecturally wrong, e.g. will result in a spurious #DB if there's a code breakpoint on the INT3/O, and lack of re-injection also breaks nested virtualization, e.g. if L1 injects a software exception and vectoring the injected exception encounters an exception that is intercepted by L0 but not L1. Due to, ahem, deficiencies in the SVM architecture, acquiring the next RIP may require flowing through the emulator even if NRIPS is supported, as the CPU clears next_rip if the VM-Exit is due to an exception other than "exceptions caused by the INT3, INTO, and BOUND instructions". To deal with this, "skip" the instruction to calculate next_rip (if it's not already known), and then unwind the RIP write and any side effects (RFLAGS updates). Save the computed next_rip and use it to re-stuff next_rip if injection doesn't complete. This allows KVM to do the right thing if next_rip was known prior to injection, e.g. if L1 injects a soft event into L2, and there is no backing INTn instruction, e.g. if L1 is injecting an arbitrary event. Note, it's impossible to guarantee architectural correctness given SVM's architectural flaws. E.g. if the guest executes INTn (no KVM injection), an exit occurs while vectoring the INTn, and the guest modifies the code stream while the exit is being handled, KVM will compute the incorrect next_rip due to "skipping" the wrong instruction. A future enhancement to make this less awful would be for KVM to detect that the decoded instruction is not the correct INTn and drop the to-be-injected soft event (retrying is a lesser evil compared to shoving the wrong RIP on the exception stack). Reported-by: Maciej S. Szmigiero Signed-off-by: Sean Christopherson Signed-off-by: Maciej S. 
Szmigiero Message-Id: <65cb88deab40bc1649d509194864312a89bbe02e.1651440202.git.maciej.szmigiero@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 26 +++++++++ arch/x86/kvm/svm/svm.c | 140 ++++++++++++++++++++++++++++++++-------------- arch/x86/kvm/svm/svm.h | 6 +- 3 files changed, 129 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index e8aa95a74564..525117a49c18 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -609,6 +609,21 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 } } +static inline bool is_evtinj_soft(u32 evtinj) +{ + u32 type = evtinj & SVM_EVTINJ_TYPE_MASK; + u8 vector = evtinj & SVM_EVTINJ_VEC_MASK; + + if (!(evtinj & SVM_EVTINJ_VALID)) + return false; + + /* + * Intentionally return false for SOFT events, SVM doesn't yet support + * re-injecting soft interrupts. + */ + return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector); +} + static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, unsigned long vmcb12_rip) { @@ -677,6 +692,16 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, else if (boot_cpu_has(X86_FEATURE_NRIPS)) vmcb02->control.next_rip = vmcb12_rip; + if (is_evtinj_soft(vmcb02->control.event_inj)) { + svm->soft_int_injected = true; + svm->soft_int_csbase = svm->vmcb->save.cs.base; + svm->soft_int_old_rip = vmcb12_rip; + if (svm->nrips_enabled) + svm->soft_int_next_rip = svm->nested.ctl.next_rip; + else + svm->soft_int_next_rip = vmcb12_rip; + } + vmcb02->control.virt_ext = vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK; if (svm->lbrv_enabled) @@ -849,6 +874,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) out_exit_err: svm->nested.nested_run_pending = 0; + svm->soft_int_injected = false; svm->vmcb->control.exit_code = SVM_EXIT_ERR; svm->vmcb->control.exit_code_hi = 0; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8c3c6bba6ccd..ec37647ca068 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -342,9 +342,11 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) } -static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) +static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, + bool commit_side_effects) { struct vcpu_svm *svm = to_svm(vcpu); + unsigned long old_rflags; /* * SEV-ES does not expose the next RIP. The RIP update is controlled by @@ -359,18 +361,75 @@ static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) } if (!svm->next_rip) { + if (unlikely(!commit_side_effects)) + old_rflags = svm->vmcb->save.rflags; + if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) return 0; + + if (unlikely(!commit_side_effects)) + svm->vmcb->save.rflags = old_rflags; } else { kvm_rip_write(vcpu, svm->next_rip); } done: - svm_set_interrupt_shadow(vcpu, 0); + if (likely(commit_side_effects)) + svm_set_interrupt_shadow(vcpu, 0); return 1; } +static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + return __svm_skip_emulated_instruction(vcpu, true); +} + +static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu) +{ + unsigned long rip, old_rip = kvm_rip_read(vcpu); + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * Due to architectural shortcomings, the CPU doesn't always provide + * NextRIP, e.g. if KVM intercepted an exception that occurred while + * the CPU was vectoring an INTO/INT3 in the guest. 
Temporarily skip + * the instruction even if NextRIP is supported to acquire the next + * RIP so that it can be shoved into the NextRIP field, otherwise + * hardware will fail to advance guest RIP during event injection. + * Drop the exception/interrupt if emulation fails and effectively + * retry the instruction, it's the least awful option. If NRIPS is + * in use, the skip must not commit any side effects such as clearing + * the interrupt shadow or RFLAGS.RF. + */ + if (!__svm_skip_emulated_instruction(vcpu, !nrips)) + return -EIO; + + rip = kvm_rip_read(vcpu); + + /* + * Save the injection information, even when using next_rip, as the + * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection + * doesn't complete due to a VM-Exit occurring while the CPU is + * vectoring the event. Decoding the instruction isn't guaranteed to + * work as there may be no backing instruction, e.g. if the event is + * being injected by L1 for L2, or if the guest is patching INT3 into + * a different instruction. + */ + svm->soft_int_injected = true; + svm->soft_int_csbase = svm->vmcb->save.cs.base; + svm->soft_int_old_rip = old_rip; + svm->soft_int_next_rip = rip; + + if (nrips) + kvm_rip_write(vcpu, old_rip); + + if (static_cpu_has(X86_FEATURE_NRIPS)) + svm->vmcb->control.next_rip = rip; + + return 0; +} + static void svm_queue_exception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -380,25 +439,9 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) kvm_deliver_exception_payload(vcpu); - if (nr == BP_VECTOR && !nrips) { - unsigned long rip, old_rip = kvm_rip_read(vcpu); - - /* - * For guest debugging where we have to reinject #BP if some - * INT3 is guest-owned: - * Emulate nRIP by moving RIP forward. Will fail if injection - * raises a fault that is not intercepted. Still better than - * failing in all cases. - */ - (void)svm_skip_emulated_instruction(vcpu); - rip = kvm_rip_read(vcpu); - - if (boot_cpu_has(X86_FEATURE_NRIPS)) - svm->vmcb->control.next_rip = rip; - - svm->int3_rip = rip + svm->vmcb->save.cs.base; - svm->int3_injected = rip - old_rip; - } + if (kvm_exception_is_soft(nr) && + svm_update_soft_interrupt_rip(vcpu)) + return; svm->vmcb->control.event_inj = nr | SVM_EVTINJ_VALID @@ -3677,15 +3720,46 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; } +static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector, + int type) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's + * associated with the original soft exception/interrupt. next_rip is + * cleared on all exits that can occur while vectoring an event, so KVM + * needs to manually set next_rip for re-injection. Unlike the !nrips + * case below, this needs to be done if and only if KVM is re-injecting + * the same event, i.e. if the event is a soft exception/interrupt, + * otherwise next_rip is unused on VMRUN. + */ + if (nrips && type == SVM_EXITINTINFO_TYPE_EXEPT && + kvm_exception_is_soft(vector) && + kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase)) + svm->vmcb->control.next_rip = svm->soft_int_next_rip; + /* + * If NRIPS isn't enabled, KVM must manually advance RIP prior to + * injecting the soft exception/interrupt. That advancement needs to + * be unwound if vectoring didn't complete. Note, the new event may + * not be the injected event, e.g. 
if KVM injected an INTn, the INTn + * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will + * be the reported vectored event, but RIP still needs to be unwound. + */ + else if (!nrips && type == SVM_EXITINTINFO_TYPE_EXEPT && + kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase)) + kvm_rip_write(vcpu, svm->soft_int_old_rip); +} + static void svm_complete_interrupts(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u8 vector; int type; u32 exitintinfo = svm->vmcb->control.exit_int_info; - unsigned int3_injected = svm->int3_injected; + bool soft_int_injected = svm->soft_int_injected; - svm->int3_injected = 0; + svm->soft_int_injected = false; /* * If we've made progress since setting HF_IRET_MASK, we've @@ -3710,17 +3784,8 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; - /* - * If NextRIP isn't enabled, KVM must manually advance RIP prior to - * injecting the soft exception/interrupt. That advancement needs to - * be unwound if vectoring didn't complete. Note, the new event may - * not be the injected event, e.g. if KVM injected an INTn, the INTn - * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will - * be the reported vectored event, but RIP still needs to be unwound. - */ - if (int3_injected && type == SVM_EXITINTINFO_TYPE_EXEPT && - kvm_is_linear_rip(vcpu, svm->int3_rip)) - kvm_rip_write(vcpu, kvm_rip_read(vcpu) - int3_injected); + if (soft_int_injected) + svm_complete_soft_interrupt(vcpu, vector, type); switch (type) { case SVM_EXITINTINFO_TYPE_NMI: vcpu->arch.nmi_injected = true; @@ -3733,13 +3798,6 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) if (vector == X86_TRAP_VC) break; - /* - * In case of software exceptions, do not reinject the vector, - * but re-execute the instruction instead. - */ - if (kvm_exception_is_soft(vector)) - break; - if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { u32 err = svm->vmcb->control.exit_int_info_err; kvm_requeue_exception_e(vcpu, vector, err); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index de076d658390..f0b6111ee5b1 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -230,8 +230,10 @@ struct vcpu_svm { bool nmi_singlestep; u64 nmi_singlestep_guest_rflags; - unsigned int3_injected; - unsigned long int3_rip; + unsigned long soft_int_csbase; + unsigned long soft_int_old_rip; + unsigned long soft_int_next_rip; + bool soft_int_injected; /* optional nested SVM features that are enabled for this guest */ bool nrips_enabled : 1; -- cgit v1.2.3 From 7e5b5ef8dca3229a5226eabf53bdc7b67ebd07ad Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 May 2022 00:07:30 +0200 Subject: KVM: SVM: Re-inject INTn instead of retrying the insn on "failure" Re-inject INTn software interrupts instead of retrying the instruction if the CPU encountered an intercepted exception while vectoring the INTn, e.g. if KVM intercepted a #PF when utilizing shadow paging. Retrying the instruction is architecturally wrong, e.g. will result in a spurious #DB if there's a code breakpoint on the INT3/O, and lack of re-injection also breaks nested virtualization, e.g. if L1 injects a software interrupt and vectoring the injected interrupt encounters an exception that is intercepted by L0 but not L1. Signed-off-by: Sean Christopherson Signed-off-by: Maciej S.
Szmigiero Message-Id: <1654ad502f860948e4f2d57b8bd881d67301f785.1651440202.git.maciej.szmigiero@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 7 +++---- arch/x86/kvm/svm/svm.c | 23 +++++++++++++++++++---- 2 files changed, 22 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 525117a49c18..0d25dea40796 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -617,10 +617,9 @@ static inline bool is_evtinj_soft(u32 evtinj) if (!(evtinj & SVM_EVTINJ_VALID)) return false; - /* - * Intentionally return false for SOFT events, SVM doesn't yet support - * re-injecting soft interrupts. - */ + if (type == SVM_EVTINJ_TYPE_SOFT) + return true; + return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ec37647ca068..6248967d4a77 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3438,12 +3438,22 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) static void svm_inject_irq(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + u32 type; + + if (vcpu->arch.interrupt.soft) { + if (svm_update_soft_interrupt_rip(vcpu)) + return; + + type = SVM_EVTINJ_TYPE_SOFT; + } else { + type = SVM_EVTINJ_TYPE_INTR; + } trace_kvm_inj_virq(vcpu->arch.interrupt.nr); ++vcpu->stat.irq_injections; svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | - SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; + SVM_EVTINJ_VALID | type; } void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, @@ -3723,6 +3733,8 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector, int type) { + bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT); + bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT); struct vcpu_svm *svm = to_svm(vcpu); /* @@ -3734,8 +3746,7 @@ static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector, * the same event, i.e. if the event is a soft exception/interrupt, * otherwise next_rip is unused on VMRUN. */ - if (nrips && type == SVM_EXITINTINFO_TYPE_EXEPT && - kvm_exception_is_soft(vector) && + if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) && kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase)) svm->vmcb->control.next_rip = svm->soft_int_next_rip; /* @@ -3746,7 +3757,7 @@ static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector, * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will * be the reported vectored event, but RIP still needs to be unwound. */ - else if (!nrips && type == SVM_EXITINTINFO_TYPE_EXEPT && + else if (!nrips && (is_soft || is_exception) && kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase)) kvm_rip_write(vcpu, svm->soft_int_old_rip); } @@ -3808,9 +3819,13 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) case SVM_EXITINTINFO_TYPE_INTR: kvm_queue_interrupt(vcpu, vector, false); break; + case SVM_EXITINTINFO_TYPE_SOFT: + kvm_queue_interrupt(vcpu, vector, true); + break; default: break; } + } static void svm_cancel_injection(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From a61d7c5432ac5a953bbcec17af031661c2bd201d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 May 2022 00:07:31 +0200 Subject: KVM: x86: Trace re-injected exceptions Trace exceptions that are re-injected, not just those that KVM is injecting for the first time. 
Debugging re-injection bugs is painful enough as is; not having visibility into what KVM is doing only makes things worse. Delay propagating pending=>injected in the non-reinjection path so that the tracing can properly identify reinjected exceptions. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Signed-off-by: Maciej S. Szmigiero Message-Id: <25470690a38b4d2b32b6204875dd35676c65c9f2.1651440202.git.maciej.szmigiero@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 12 ++++++++---- arch/x86/kvm/x86.c | 16 +++++++++------- 2 files changed, 17 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index de4762517569..d07428e660e3 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -358,25 +358,29 @@ TRACE_EVENT(kvm_inj_virq, * Tracepoint for kvm interrupt injection: */ TRACE_EVENT(kvm_inj_exception, - TP_PROTO(unsigned exception, bool has_error, unsigned error_code), - TP_ARGS(exception, has_error, error_code), + TP_PROTO(unsigned exception, bool has_error, unsigned error_code, + bool reinjected), + TP_ARGS(exception, has_error, error_code, reinjected), TP_STRUCT__entry( __field( u8, exception ) __field( u8, has_error ) __field( u32, error_code ) + __field( bool, reinjected ) ), TP_fast_assign( __entry->exception = exception; __entry->has_error = has_error; __entry->error_code = error_code; + __entry->reinjected = reinjected; ), - TP_printk("%s (0x%x)", + TP_printk("%s (0x%x)%s", __print_symbolic(__entry->exception, kvm_trace_sym_exc), /* FIXME: don't print error_code if not present */ - __entry->has_error ? __entry->error_code : 0) + __entry->has_error ? __entry->error_code : 0, + __entry->reinjected ? " [reinjected]" : "") ); /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 921b1139c303..c9f3ad89bf4c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9408,6 +9408,11 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu) static void kvm_inject_exception(struct kvm_vcpu *vcpu) { + trace_kvm_inj_exception(vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code, + vcpu->arch.exception.injected); + if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) vcpu->arch.exception.error_code = false; static_call(kvm_x86_queue_exception)(vcpu); @@ -9465,13 +9470,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) /* try to inject new event if pending */ if (vcpu->arch.exception.pending) { - trace_kvm_inj_exception(vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); - - vcpu->arch.exception.pending = false; - vcpu->arch.exception.injected = true; - if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | X86_EFLAGS_RF); @@ -9485,6 +9483,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) } kvm_inject_exception(vcpu); + + vcpu->arch.exception.pending = false; + vcpu->arch.exception.injected = true; + can_inject = false; } -- cgit v1.2.3 From 21d4c575eb4a1e6d956b61b5e9c162895fa7d4ba Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 May 2022 00:07:32 +0200 Subject: KVM: x86: Print error code in exception injection tracepoint iff valid Print the error code in the exception injection tracepoint if and only if the exception has an error code.
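As an illustrative aside, the conditional-string formatting this commit adopts can be sketched in plain C with hypothetical names; the real code uses TP_printk() and __print_symbolic(), as the diff below shows:

#include <stdbool.h>
#include <stdio.h>

/*
 * Hypothetical plain-C model of the tracepoint format: each piece of
 * the "(error_code)" decoration is its own conditional string, so
 * exceptions without an error code print nothing extra.
 */
static void show_exception(const char *name, bool has_error,
			   unsigned int error_code, bool reinjected)
{
	char hex[16];

	snprintf(hex, sizeof(hex), "0x%x", error_code);
	printf("%s%s%s%s%s\n", name,
	       has_error ? " (" : "",
	       has_error ? hex : "",
	       has_error ? ")" : "",
	       reinjected ? " [reinjected]" : "");
}

int main(void)
{
	show_exception("#GP", true, 0x10, false);	/* "#GP (0x10)" */
	show_exception("#UD", false, 0, true);		/* "#UD [reinjected]" */
	return 0;
}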
Define the entire error code sequence as a set of formatted strings, print empty strings if there's no error code, and abuse __print_symbolic() by passing it an empty array to coerce it into printing the error code as a hex string. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Signed-off-by: Maciej S. Szmigiero Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index d07428e660e3..385436d12024 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -376,10 +376,11 @@ TRACE_EVENT(kvm_inj_exception, __entry->reinjected = reinjected; ), - TP_printk("%s (0x%x)%s", + TP_printk("%s%s%s%s%s", __print_symbolic(__entry->exception, kvm_trace_sym_exc), - /* FIXME: don't print error_code if not present */ - __entry->has_error ? __entry->error_code : 0, + !__entry->has_error ? "" : " (", + !__entry->has_error ? "" : __print_symbolic(__entry->error_code, { }), + !__entry->has_error ? "" : ")", __entry->reinjected ? " [reinjected]" : "") ); -- cgit v1.2.3 From 2d61391270a3ceb95b3dd536ea13002e653323b6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 May 2022 00:07:33 +0200 Subject: KVM: x86: Differentiate Soft vs. Hard IRQs vs. reinjected in tracepoint In the IRQ injection tracepoint, differentiate between Hard IRQs and Soft "IRQs", i.e. interrupts that are reinjected after incomplete delivery of a software interrupt from an INTn instruction. Tag reinjected interrupts as such, even though the information is usually redundant since soft interrupts are only ever reinjected by KVM. Though rare in practice, a hard IRQ can be reinjected. Signed-off-by: Sean Christopherson [MSS: change "kvm_inj_virq" event "reinjected" field type to bool] Signed-off-by: Maciej S. 
Szmigiero Message-Id: <9664d49b3bd21e227caa501cff77b0569bebffe2.1651440202.git.maciej.szmigiero@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm/svm.c | 5 +++-- arch/x86/kvm/trace.h | 16 +++++++++++----- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- 5 files changed, 19 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 959d66b9be94..8109805b5429 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1405,7 +1405,7 @@ struct kvm_x86_ops { u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); - void (*inject_irq)(struct kvm_vcpu *vcpu); + void (*inject_irq)(struct kvm_vcpu *vcpu, bool reinjected); void (*inject_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 6248967d4a77..0f4d38e5ceab 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3435,7 +3435,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) ++vcpu->stat.nmi_injections; } -static void svm_inject_irq(struct kvm_vcpu *vcpu) +static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_svm *svm = to_svm(vcpu); u32 type; @@ -3449,7 +3449,8 @@ static void svm_inject_irq(struct kvm_vcpu *vcpu) type = SVM_EVTINJ_TYPE_INTR; } - trace_kvm_inj_virq(vcpu->arch.interrupt.nr); + trace_kvm_inj_virq(vcpu->arch.interrupt.nr, + vcpu->arch.interrupt.soft, reinjected); ++vcpu->stat.irq_injections; svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 385436d12024..fd28dd40b813 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -333,18 +333,24 @@ TRACE_EVENT_KVM_EXIT(kvm_exit); * Tracepoint for kvm interrupt injection: */ TRACE_EVENT(kvm_inj_virq, - TP_PROTO(unsigned int irq), - TP_ARGS(irq), + TP_PROTO(unsigned int vector, bool soft, bool reinjected), + TP_ARGS(vector, soft, reinjected), TP_STRUCT__entry( - __field( unsigned int, irq ) + __field( unsigned int, vector ) + __field( bool, soft ) + __field( bool, reinjected ) ), TP_fast_assign( - __entry->irq = irq; + __entry->vector = vector; + __entry->soft = soft; + __entry->reinjected = reinjected; ), - TP_printk("irq %u", __entry->irq) + TP_printk("%s 0x%x%s", + __entry->soft ? "Soft/INTn" : "IRQ", __entry->vector, + __entry->reinjected ? 
" [reinjected]" : "") ); #define EXS(x) { x##_VECTOR, "#" #x } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e1aa14743cdb..9714ae95589f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4573,13 +4573,13 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); } -static void vmx_inject_irq(struct kvm_vcpu *vcpu) +static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_vmx *vmx = to_vmx(vcpu); uint32_t intr; int irq = vcpu->arch.interrupt.nr; - trace_kvm_inj_virq(irq); + trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c9f3ad89bf4c..501606e02688 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9448,7 +9448,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) static_call(kvm_x86_inject_nmi)(vcpu); can_inject = false; } else if (vcpu->arch.interrupt.injected) { - static_call(kvm_x86_inject_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu, true); can_inject = false; } } @@ -9539,7 +9539,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) goto out; if (r) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); - static_call(kvm_x86_inject_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu, false); WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); } if (kvm_cpu_has_injectable_intr(vcpu)) -- cgit v1.2.3 From 159fc6fa3b7db24db85598115cc43dc47196919e Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Mon, 2 May 2022 00:07:34 +0200 Subject: KVM: nSVM: Transparently handle L1 -> L2 NMI re-injection A NMI that L1 wants to inject into its L2 should be directly re-injected, without causing L0 side effects like engaging NMI blocking for L1. It's also worth noting that in this case it is L1 responsibility to track the NMI window status for its L2 guest. Signed-off-by: Maciej S. 
Szmigiero Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 12 ++++++++++++ arch/x86/kvm/svm/svm.c | 7 +++++++ arch/x86/kvm/svm/svm.h | 1 + 3 files changed, 20 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 0d25dea40796..688f86b9202a 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -623,6 +623,16 @@ static inline bool is_evtinj_soft(u32 evtinj) return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector); } +static bool is_evtinj_nmi(u32 evtinj) +{ + u32 type = evtinj & SVM_EVTINJ_TYPE_MASK; + + if (!(evtinj & SVM_EVTINJ_VALID)) + return false; + + return type == SVM_EVTINJ_TYPE_NMI; +} + static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, unsigned long vmcb12_rip) { @@ -691,6 +701,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, else if (boot_cpu_has(X86_FEATURE_NRIPS)) vmcb02->control.next_rip = vmcb12_rip; + svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj); if (is_evtinj_soft(vmcb02->control.event_inj)) { svm->soft_int_injected = true; svm->soft_int_csbase = svm->vmcb->save.cs.base; @@ -873,6 +884,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) out_exit_err: svm->nested.nested_run_pending = 0; + svm->nmi_l1_to_l2 = false; svm->soft_int_injected = false; svm->vmcb->control.exit_code = SVM_EXIT_ERR; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0f4d38e5ceab..5fda7e7102f2 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3429,6 +3429,10 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; + + if (svm->nmi_l1_to_l2) + return; + vcpu->arch.hflags |= HF_NMI_MASK; if (!sev_es_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_IRET); @@ -3769,8 +3773,10 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) u8 vector; int type; u32 exitintinfo = svm->vmcb->control.exit_int_info; + bool nmi_l1_to_l2 = svm->nmi_l1_to_l2; bool soft_int_injected = svm->soft_int_injected; + svm->nmi_l1_to_l2 = false; svm->soft_int_injected = false; /* @@ -3802,6 +3808,7 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) switch (type) { case SVM_EXITINTINFO_TYPE_NMI: vcpu->arch.nmi_injected = true; + svm->nmi_l1_to_l2 = nmi_l1_to_l2; break; case SVM_EXITINTINFO_TYPE_EXEPT: /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f0b6111ee5b1..24b5c73a8c87 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -229,6 +229,7 @@ struct vcpu_svm { bool nmi_singlestep; u64 nmi_singlestep_guest_rflags; + bool nmi_l1_to_l2; unsigned long soft_int_csbase; unsigned long soft_int_old_rip; -- cgit v1.2.3 From 9fb3565743d58352f00964bf47213b88aff4bb82 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 13 May 2022 19:49:59 +0000 Subject: KVM: x86/mmu: Drop RWX=0 SPTEs during ept_sync_page() All of sync_page()'s existing checks filter out only !PRESENT gPTE, because without execute-only, all upper levels are guaranteed to be at least READABLE. However, if EPT with execute-only support is in use by L1, KVM can create an SPTE that is shadow-present but guest-inaccessible (RWX=0) if the upper level combined permissions are R (or RW) and the leaf EPTE is changed from R (or RW) to X. 
Because the EPTE is considered present when viewed in isolation, and no reserved bits are set, FNAME(prefetch_invalid_gpte) will consider the GPTE valid, and cause a not-present SPTE to be created. The SPTE is "correct": the guest translation is inaccessible because the combined protections of all levels yield RWX=0, and KVM will just redirect any vmexits to the guest. If EPT A/D bits are disabled, KVM can mistake the SPTE for an access-tracked SPTE, but again such confusion isn't fatal, as the "saved" protections are also RWX=0. However, creating a useless SPTE in general means that KVM messed up something, even if this particular goof didn't manifest as a functional bug. So, drop SPTEs whose new protections will yield a RWX=0 SPTE, and add a WARN in make_spte() to detect creation of SPTEs that will result in RWX=0 protections. Fixes: d95c55687e11 ("kvm: mmu: track read permission explicitly for shadow EPT page tables") Cc: David Matlack Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20220513195000.99371-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 9 ++++++++- arch/x86/kvm/mmu/spte.c | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index db80f7ccaa4e..1576e65b3b1f 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -1053,7 +1053,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) continue; - if (gfn != sp->gfns[i]) { + /* + * Drop the SPTE if the new protections would result in a RWX=0 + * SPTE or if the gfn is changing. The RWX=0 case only affects + * EPT with execute-only support, i.e. EPT without an effective + * "present" bit, as all other paging modes will create a + * read-only SPTE if pte_access is zero. + */ + if ((!pte_access && !shadow_present_mask) || gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); flush = true; continue; diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index b5960bbde7f7..cda1851ec155 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -129,6 +129,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 spte = SPTE_MMU_PRESENT_MASK; bool wrprot = false; + WARN_ON_ONCE(!pte_access && !shadow_present_mask); + if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED_MASK; else if (kvm_mmu_page_ad_need_write_protect(sp)) -- cgit v1.2.3 From b8b9156ec6ef69baa487185205f2be833267776b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 13 May 2022 19:50:00 +0000 Subject: KVM: x86/mmu: Comment FNAME(sync_page) to document TLB flushing logic Add a comment to FNAME(sync_page) to explain why the TLB flushing logic conspicuously doesn't handle the scenario of guest protections being reduced. Specifically, if synchronizing a SPTE drops execute protections, KVM will not emit a TLB flush, whereas dropping writable or clearing A/D bits does trigger a flush via mmu_spte_update(). Architecturally, until the GPTE is implicitly or explicitly flushed from the guest's perspective, KVM is not required to flush any old, stale translations.
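To make the RWX=0 scenario from the previous commit concrete, here is a hedged standalone sketch; the constants and variable names are illustrative rather than KVM's. EPT access rights AND together across paging levels, so each EPTE can be non-zero ("present" in isolation) while the combined rights are empty:

#include <stdint.h>
#include <stdio.h>

#define EPT_R 0x1u	/* readable */
#define EPT_W 0x2u	/* writable */
#define EPT_X 0x4u	/* executable */

int main(void)
{
	uint32_t upper_epte = EPT_R | EPT_W;	/* upper level: RW */
	uint32_t leaf_epte  = EPT_X;		/* leaf changed from R to X */
	uint32_t combined   = upper_epte & leaf_epte;

	/* Each EPTE is non-zero, yet the combined rights are empty. */
	printf("combined RWX = %#x\n", (unsigned)combined);	/* prints 0 */
	return 0;
}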
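Likewise, the flushing rule that the new comment documents can be distilled into a hypothetical predicate (function and parameter names invented for illustration):

#include <stdbool.h>

/*
 * Hypothetical distillation of the rule: a flush is required when KVM
 * itself must retire the old translation (SPTE dropped, or W/A/D bits
 * cleared for mmu_notifier/dirty-logging correctness); merely reducing
 * guest-visible protections, e.g. dropping execute, is architecturally
 * the guest's own flush to perform.
 */
static bool sync_page_needs_flush(bool dropped_spte, bool cleared_w_a_d,
				  bool reduced_guest_prot)
{
	(void)reduced_guest_prot;	/* guest's responsibility */
	return dropped_spte || cleared_w_a_d;
}

int main(void)
{
	/* Reduced guest protections alone: no KVM-initiated flush. */
	return sync_page_needs_flush(false, false, true) ? 1 : 0;
}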
Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Message-Id: <20220513195000.99371-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 1576e65b3b1f..fe35d8fd3276 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -1077,6 +1077,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) flush |= mmu_spte_update(sptep, spte); } + /* + * Note, any flush is purely for KVM's correctness, e.g. when dropping + * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier + * unmap or dirty logging event doesn't fail to flush. The guest is + * responsible for flushing the TLB to ensure any changes in protection + * bits are recognized, i.e. until the guest flushes or page faults on + * a relevant address, KVM is architecturally allowed to let vCPUs use + * cached translations with the old protection bits. + */ return flush; } -- cgit v1.2.3 From 465932db25f3664893b66152c7b190afd28c32db Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Tue, 19 Apr 2022 23:32:40 +0800 Subject: x86/cpu: Add new VMX feature, Tertiary VM-Execution control A new 64-bit control field "tertiary processor-based VM-execution controls" is defined [1]. It's controlled by bit 17 of the primary processor-based VM-execution controls. Different from its brother VM-execution fields, this tertiary VM-execution controls field is 64 bit. So it occupies 2 vmx_feature_leafs, TERTIARY_CTLS_LOW and TERTIARY_CTLS_HIGH. Its companion VMX capability reporting MSR, MSR_IA32_VMX_PROCBASED_CTLS3 (0x492), is also semantically different from its brothers, whose 64 bits consist of all allow-1, rather than 32-bit allow-0 and 32-bit allow-1 [1][2]. Therefore, its init_vmx_capabilities() is a little different from others. [1] ISE 6.2 "VMCS Changes" https://www.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html [2] SDM Vol3.
Appendix A.3 Reviewed-by: Sean Christopherson Reviewed-by: Maxim Levitsky Signed-off-by: Robert Hoo Signed-off-by: Zeng Guang Message-Id: <20220419153240.11549-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/include/asm/vmxfeatures.h | 3 ++- arch/x86/kernel/cpu/feat_ctl.c | 9 ++++++++- 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 403e83b4adc8..c194995b2e1f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -980,6 +980,7 @@ #define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f #define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490 #define MSR_IA32_VMX_VMFUNC 0x00000491 +#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492 /* VMX_BASIC bits and bitmasks */ #define VMX_BASIC_VMCS_SIZE_SHIFT 32 diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index d9a74681a77d..ff20776dc83b 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -5,7 +5,7 @@ /* * Defines VMX CPU feature bits */ -#define NVMXINTS 3 /* N 32-bit words worth of info */ +#define NVMXINTS 5 /* N 32-bit words worth of info */ /* * Note: If the comment begins with a quoted string, that string is used @@ -43,6 +43,7 @@ #define VMX_FEATURE_RDTSC_EXITING ( 1*32+ 12) /* "" VM-Exit on RDTSC */ #define VMX_FEATURE_CR3_LOAD_EXITING ( 1*32+ 15) /* "" VM-Exit on writes to CR3 */ #define VMX_FEATURE_CR3_STORE_EXITING ( 1*32+ 16) /* "" VM-Exit on reads from CR3 */ +#define VMX_FEATURE_TERTIARY_CONTROLS ( 1*32+ 17) /* "" Enable Tertiary VM-Execution Controls */ #define VMX_FEATURE_CR8_LOAD_EXITING ( 1*32+ 19) /* "" VM-Exit on writes to CR8 */ #define VMX_FEATURE_CR8_STORE_EXITING ( 1*32+ 20) /* "" VM-Exit on reads from CR8 */ #define VMX_FEATURE_VIRTUAL_TPR ( 1*32+ 21) /* "vtpr" TPR virtualization, a.k.a. TPR shadow */ diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index da696eb4821a..993697e71854 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -15,6 +15,8 @@ enum vmx_feature_leafs { MISC_FEATURES = 0, PRIMARY_CTLS, SECONDARY_CTLS, + TERTIARY_CTLS_LOW, + TERTIARY_CTLS_HIGH, NR_VMX_FEATURE_WORDS, }; @@ -22,7 +24,7 @@ enum vmx_feature_leafs { static void init_vmx_capabilities(struct cpuinfo_x86 *c) { - u32 supported, funcs, ept, vpid, ign; + u32 supported, funcs, ept, vpid, ign, low, high; BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS); @@ -42,6 +44,11 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported); c->vmx_capability[SECONDARY_CTLS] = supported; + /* All 64 bits of tertiary controls MSR are allowed-1 settings. */ + rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS3, &low, &high); + c->vmx_capability[TERTIARY_CTLS_LOW] = low; + c->vmx_capability[TERTIARY_CTLS_HIGH] = high; + rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported); rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs); -- cgit v1.2.3 From ed3905ba60384ab8c73b421c3618375e58080a9a Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Tue, 19 Apr 2022 23:33:18 +0800 Subject: KVM: VMX: Extend BUILD_CONTROLS_SHADOW macro to support 64-bit variation The Tertiary VM-Exec Control, different from previous control fields, is 64 bit. So extend BUILD_CONTROLS_SHADOW() by adding a 'bits' parameter, to support building the auxiliary functions for both 32-bit and 64-bit fields.
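To see why a token-pasted width parameter works, here is a hedged, self-contained sketch of the same pattern; the writer stubs, field numbers, and names below are stand-ins, not the real KVM definitions:

#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;
typedef uint64_t u64;

/* Stand-ins for the real VMCS writers. */
static void vmcs_write32(unsigned long field, u32 val)
{
	printf("VMWRITE32 field=%#lx val=%u\n", field, val);
}

static void vmcs_write64(unsigned long field, u64 val)
{
	printf("VMWRITE64 field=%#lx val=%llu\n", field, (unsigned long long)val);
}

/*
 * The 'bits' argument is token-pasted into both the type (u32/u64) and
 * the writer name (vmcs_write32/vmcs_write64), so one macro body emits
 * cached accessors for 32-bit and 64-bit control fields alike.
 */
#define BUILD_SHADOW(lname, field, bits)				\
static u##bits lname##_shadow;						\
static void lname##_set(u##bits val)					\
{									\
	if (lname##_shadow != val) {					\
		vmcs_write##bits(field, val);				\
		lname##_shadow = val;					\
	}								\
}

BUILD_SHADOW(exec, 0x4002, 32)		/* a 32-bit control field */
BUILD_SHADOW(tertiary_exec, 0x2034, 64)	/* the new 64-bit field */

int main(void)
{
	exec_set(1u << 17);	/* first write reaches the "VMCS" */
	exec_set(1u << 17);	/* cached: no second VMWRITE */
	tertiary_exec_set(1);
	return 0;
}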
Suggested-by: Sean Christopherson Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Robert Hoo Signed-off-by: Zeng Guang Message-Id: <20220419153318.11595-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.h | 56 +++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index b98c7e96697a..56be4cd4edaf 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -456,35 +456,35 @@ static inline u8 vmx_get_rvi(void) return vmcs_read16(GUEST_INTR_STATUS) & 0xff; } -#define BUILD_CONTROLS_SHADOW(lname, uname) \ -static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ -{ \ - if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ - vmcs_write32(uname, val); \ - vmx->loaded_vmcs->controls_shadow.lname = val; \ - } \ -} \ -static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \ -{ \ - return vmcs->controls_shadow.lname; \ -} \ -static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \ -{ \ - return __##lname##_controls_get(vmx->loaded_vmcs); \ -} \ -static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ -} \ -static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u32 val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ +#define BUILD_CONTROLS_SHADOW(lname, uname, bits) \ +static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ + vmcs_write##bits(uname, val); \ + vmx->loaded_vmcs->controls_shadow.lname = val; \ + } \ +} \ +static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \ +{ \ + return vmcs->controls_shadow.lname; \ +} \ +static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \ +{ \ + return __##lname##_controls_get(vmx->loaded_vmcs); \ +} \ +static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ +} \ +static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ } -BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS) -BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS) -BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) -BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL) -BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL) +BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32) /* * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the -- cgit v1.2.3 From 1ad4e5438c67a01620ed67cea959de89f4430515 Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Tue, 19 Apr 2022 23:34:00 +0800 Subject: KVM: VMX: Detect Tertiary VM-Execution control when setup VMCS config Check VMX features on tertiary execution control in VMCS config setup. Sub-features in tertiary execution control to be enabled are adjusted according to hardware capabilities although no sub-feature is enabled in this patch. EVMCSv1 doesn't support tertiary VM-execution control, so disable it when EVMCSv1 is in use. 
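For context, a minimal sketch of the capability adjustment this implies (a simplified helper, not the kernel's exact code): since every bit of the tertiary-controls MSR is an allowed-1 setting, sanitizing the desired controls is a plain AND, which is what adjust_vmx_controls64() in the diff below does after reading the MSR:

#include <assert.h>
#include <stdint.h>

/*
 * Hypothetical model: unlike the legacy 32-bit control MSRs, whose low
 * half is allowed-0 (bits forced to 1) and high half allowed-1, every
 * bit of MSR_IA32_VMX_PROCBASED_CTLS3 is an allowed-1 setting, so the
 * supported subset of the desired controls is a plain AND.
 */
static uint64_t adjust_controls64(uint64_t opt, uint64_t msr_allowed1)
{
	return opt & msr_allowed1;
}

int main(void)
{
	/* CPU reports only bit 0 supported; we asked for bits 0 and 5. */
	assert(adjust_controls64(0x21, 0x01) == 0x01);
	return 0;
}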
And define the auxiliary functions for Tertiary control field here, using the new BUILD_CONTROLS_SHADOW(). Reviewed-by: Maxim Levitsky Signed-off-by: Robert Hoo Signed-off-by: Zeng Guang Message-Id: <20220419153400.11642-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 3 +++ arch/x86/kvm/vmx/capabilities.h | 7 +++++++ arch/x86/kvm/vmx/evmcs.c | 2 ++ arch/x86/kvm/vmx/evmcs.h | 1 + arch/x86/kvm/vmx/vmcs.h | 1 + arch/x86/kvm/vmx/vmx.c | 29 ++++++++++++++++++++++++++++- arch/x86/kvm/vmx/vmx.h | 1 + 7 files changed, 43 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 6c343c6a1855..34ca428fefed 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -31,6 +31,7 @@ #define CPU_BASED_RDTSC_EXITING VMCS_CONTROL_BIT(RDTSC_EXITING) #define CPU_BASED_CR3_LOAD_EXITING VMCS_CONTROL_BIT(CR3_LOAD_EXITING) #define CPU_BASED_CR3_STORE_EXITING VMCS_CONTROL_BIT(CR3_STORE_EXITING) +#define CPU_BASED_ACTIVATE_TERTIARY_CONTROLS VMCS_CONTROL_BIT(TERTIARY_CONTROLS) #define CPU_BASED_CR8_LOAD_EXITING VMCS_CONTROL_BIT(CR8_LOAD_EXITING) #define CPU_BASED_CR8_STORE_EXITING VMCS_CONTROL_BIT(CR8_STORE_EXITING) #define CPU_BASED_TPR_SHADOW VMCS_CONTROL_BIT(VIRTUAL_TPR) @@ -221,6 +222,8 @@ enum vmcs_field { ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, TSC_MULTIPLIER = 0x00002032, TSC_MULTIPLIER_HIGH = 0x00002033, + TERTIARY_VM_EXEC_CONTROL = 0x00002034, + TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 3f430e218375..31f3d88b3e4d 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -59,6 +59,7 @@ struct vmcs_config { u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; u32 cpu_based_2nd_exec_ctrl; + u64 cpu_based_3rd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; struct nested_vmx_msrs nested; @@ -131,6 +132,12 @@ static inline bool cpu_has_secondary_exec_ctrls(void) CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; } +static inline bool cpu_has_tertiary_exec_ctrls(void) +{ + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; +} + static inline bool cpu_has_vmx_virtualize_apic_accesses(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 87e3dc10edf4..6a61b1ae7942 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -297,8 +297,10 @@ const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); #if IS_ENABLED(CONFIG_HYPERV) __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) { + vmcs_conf->cpu_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_EXEC_CTRL; vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + vmcs_conf->cpu_based_3rd_exec_ctrl = 0; vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 8d70f9aea94b..f886a8ff0342 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -50,6 +50,7 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); */ #define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ PIN_BASED_VMX_PREEMPTION_TIMER) +#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) #define 
EVMCS1_UNSUPPORTED_2NDEXEC \ (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 2b9d7a7e83f7..ac290a44a693 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -50,6 +50,7 @@ struct vmcs_controls_shadow { u32 pin; u32 exec; u32 secondary_exec; + u64 tertiary_exec; }; /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9714ae95589f..9d3d41b21059 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2412,6 +2412,15 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } +static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +{ + u64 allowed; + + rdmsrl(msr, allowed); + + return ctl_opt & allowed; +} + static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { @@ -2420,6 +2429,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; + u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; @@ -2441,7 +2451,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO; @@ -2515,6 +2526,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, "1-setting enable VPID VM-execution control\n"); } + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) { + u64 opt3 = 0; + + _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3, + MSR_IA32_VMX_PROCBASED_CTLS3); + } + min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; @@ -2601,6 +2619,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; + vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; @@ -4222,6 +4241,11 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) +{ + return vmcs_config.cpu_based_3rd_exec_ctrl; +} + /* * Adjust a single secondary execution control bit to intercept/allow an * instruction in the guest. 
This is usually done based on whether or not a @@ -4387,6 +4411,9 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (cpu_has_secondary_exec_ctrls()) secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); + if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 56be4cd4edaf..c37befcea2c0 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -485,6 +485,7 @@ BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32) BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32) BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64) /* * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the -- cgit v1.2.3 From 0b85baa5f46de1c6ad6e4b987905df041f2f80f0 Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Tue, 19 Apr 2022 23:34:41 +0800 Subject: KVM: VMX: Report tertiary_exec_control field in dump_vmcs() Add tertiary_exec_control field report in dump_vmcs(). Meanwhile, reorganize the dump output of VMCS category as follows. Before change: *** Control State *** PinBased=0x000000ff CPUBased=0xb5a26dfa SecondaryExec=0x061037eb EntryControls=0000d1ff ExitControls=002befff After change: *** Control State *** CPUBased=0xb5a26dfa SecondaryExec=0x061037eb TertiaryExec=0x0000000000000010 PinBased=0x000000ff EntryControls=0000d1ff ExitControls=002befff Reviewed-by: Maxim Levitsky Signed-off-by: Robert Hoo Signed-off-by: Zeng Guang Message-Id: <20220419153441.11687-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9d3d41b21059..4238a55c26e6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5872,6 +5872,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmentry_ctl, vmexit_ctl; u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; + u64 tertiary_exec_control; unsigned long cr4; int efer_slot; @@ -5885,9 +5886,16 @@ void dump_vmcs(struct kvm_vcpu *vcpu) cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); cr4 = vmcs_readl(GUEST_CR4); - secondary_exec_control = 0; + if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + else + secondary_exec_control = 0; + + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); + else + tertiary_exec_control = 0; pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); @@ -5987,9 +5995,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu) vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); pr_err("*** Control State ***\n"); - pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", - pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); - pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); + pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", + cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); + pr_err("PinBased=0x%08x 
EntryControls=%08x ExitControls=%08x\n", + pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", vmcs_read32(EXCEPTION_BITMAP), vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), -- cgit v1.2.3 From 5413bcba7ed57206178d60ee03dd5bb3a460e645 Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Tue, 19 Apr 2022 23:35:16 +0800 Subject: KVM: x86: Add support for vICR APIC-write VM-Exits in x2APIC mode Upcoming Intel CPUs will support virtual x2APIC MSR writes to the vICR, i.e. will trap and generate an APIC-write VM-Exit instead of intercepting the WRMSR. Add support for handling "nodecode" x2APIC writes, which were previously impossible. Note, x2APIC MSR writes are 64 bits wide. Signed-off-by: Zeng Guang Message-Id: <20220419153516.11739-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index f1bdac3f5aa8..39b805666a18 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -67,6 +67,7 @@ static bool lapic_timer_advance_dynamic __read_mostly; #define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 +static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val) { @@ -2231,10 +2232,27 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { - u32 val = kvm_lapic_get_reg(vcpu->arch.apic, offset); + struct kvm_lapic *apic = vcpu->arch.apic; + u64 val; + + if (apic_x2apic_mode(apic)) { + /* + * When guest APIC is in x2APIC mode and IPI virtualization + * is enabled, accessing APIC_ICR may cause trap-like VM-exit + * on Intel hardware. Other offsets are not possible. + */ + if (WARN_ON_ONCE(offset != APIC_ICR)) + return; - /* TODO: optimize to just emulate side effect w/o one more write */ - kvm_lapic_reg_write(vcpu->arch.apic, offset, val); + kvm_lapic_msr_read(apic, offset, &val); + kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32)); + trace_kvm_apic_write(APIC_ICR, val); + } else { + val = kvm_lapic_get_reg(apic, offset); + + /* TODO: optimize to just emulate side effect w/o one more write */ + kvm_lapic_reg_write(apic, offset, (u32)val); + } } EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); -- cgit v1.2.3 From f08a06c9a35706349f74b7a18deefe3f89f73e8e Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Tue, 19 Apr 2022 23:36:04 +0800 Subject: KVM: VMX: Clean up vmx_refresh_apicv_exec_ctrl() Remove the cpu_has_secondary_exec_ctrls() condition check. Calling vmx_refresh_apicv_exec_ctrl() presumes that the secondary controls are activated and that the APICv-related VMCS fields are valid. If it is invoked in the wrong circumstances, then in the worst case the VMX operation reports a VMfailValid error without further harmful impact and simply behaves as if all the secondary controls were 0.
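To make the 64-bit vICR split in the x2APIC patch above concrete, here is a standalone sketch (not KVM code; the sample value is hypothetical) of how a single x2APIC ICR write carries both halves that kvm_apic_send_ipi() consumes:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      /* x2APIC ICR: destination APIC ID in bits 63:32, vector in bits 7:0 */
      uint64_t icr = ((uint64_t)3 << 32) | 0xef;    /* vector 0xef to APIC ID 3 */
      uint32_t icr_low = (uint32_t)icr;             /* vector, delivery/dest mode, ... */
      uint32_t icr_high = (uint32_t)(icr >> 32);    /* destination APIC ID */

      printf("vector=0x%x dest=%u\n", (unsigned)(icr_low & 0xff), (unsigned)icr_high);
      return 0;
  }

A single trap-like VM-exit thus delivers the whole IPI description at once, unlike the xAPIC case where ICR and ICR2 are separate 32-bit registers.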
Suggested-by: Sean Christopherson Signed-off-by: Zeng Guang Message-Id: <20220419153604.11786-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4238a55c26e6..3cbe3a38b356 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4201,16 +4201,15 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) } pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - if (cpu_has_secondary_exec_ctrls()) { - if (kvm_vcpu_apicv_active(vcpu)) - secondary_exec_controls_setbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - else - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - } + + if (kvm_vcpu_apicv_active(vcpu)) + secondary_exec_controls_setbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + else + secondary_exec_controls_clearbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); vmx_update_msr_bitmap_x2apic(vcpu); } -- cgit v1.2.3 From 1d5e740d518e02cea46325b3d37135bf9c08982a Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Tue, 19 Apr 2022 23:44:09 +0800 Subject: KVM: Move kvm_arch_vcpu_precreate() under kvm->lock kvm_arch_vcpu_precreate() handles arch-specific VM resources that must be prepared prior to the actual creation of a vCPU. For example, x86 may need to do a per-VM allocation based on max_vcpu_ids at the first vCPU creation. Such an allocation requires concurrency control, as multiple vCPU creations can happen simultaneously. From the architectural point of view, it is necessary to execute kvm_arch_vcpu_precreate() under the protection of kvm->lock. Currently only arm64, x86 and s390 have non-nop implementations at the vCPU pre-creation stage. Remove the lock acquisition from s390's implementation and make sure all architectures can run kvm_arch_vcpu_precreate() safely under kvm->lock without recursive locking issues. Suggested-by: Sean Christopherson Signed-off-by: Zeng Guang Message-Id: <20220419154409.11842-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 2 -- arch/x86/kvm/x86.c | 2 +- virt/kvm/kvm_main.c | 10 ++++++---- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ff457a77e22b..72bd5c9b9617 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3238,9 +3238,7 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) if (!sclp.has_esca || !sclp.has_64bscao) return false; - mutex_lock(&kvm->lock); rc = kvm->arch.use_esca ?
0 : sca_switch_to_extended(kvm); - mutex_unlock(&kvm->lock); return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 501606e02688..9c2e2b6d1767 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11242,7 +11242,7 @@ static int sync_regs(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { - if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + if (kvm_check_tsc_unstable() && kvm->created_vcpus) pr_warn_once("kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7f79abdbd68d..6dbdcbda0291 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3768,13 +3768,15 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) return -EINVAL; } + r = kvm_arch_vcpu_precreate(kvm, id); + if (r) { + mutex_unlock(&kvm->lock); + return r; + } + kvm->created_vcpus++; mutex_unlock(&kvm->lock); - r = kvm_arch_vcpu_precreate(kvm, id); - if (r) - goto vcpu_decrement; - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!vcpu) { r = -ENOMEM; -- cgit v1.2.3 From 35875316384b71d23dc2a45a969732fc8cab16af Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Tue, 19 Apr 2022 23:44:44 +0800 Subject: KVM: x86: Allow userspace to set maximum VCPU id for VM Introduce a new max_vcpu_ids field in KVM for the x86 architecture. Userspace can set the maximum possible vCPU id for the current VM session using the KVM_CAP_MAX_VCPU_ID argument of the KVM_ENABLE_CAP ioctl(). This is done for x86 only because the sole use case is to guide memory allocation for the PID-pointer table, a structure needed to enable VMX IPI virtualization. By default, max_vcpu_ids is set to KVM_MAX_VCPU_IDS. Suggested-by: Sean Christopherson Reviewed-by: Maxim Levitsky Signed-off-by: Zeng Guang Message-Id: <20220419154444.11888-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 21 +++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/x86.c | 20 ++++++++++++++++++++ 3 files changed, 47 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 94d73ec52aba..421479a67da5 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7494,6 +7494,27 @@ The valid bits in cap.args[0] are: generate a #UD within the guest. =================================== ============================================ +7.32 KVM_CAP_MAX_VCPU_ID +------------------------ + +:Architectures: x86 +:Target: VM +:Parameters: args[0] - maximum APIC ID value set for current VM +:Returns: 0 on success, -EINVAL if args[0] is beyond KVM_MAX_VCPU_IDS + supported in KVM or if it has been set. + +This capability allows userspace to specify maximum possible APIC ID +assigned for current VM session prior to the creation of vCPUs, saving +memory for data structures indexed by the APIC ID. Userspace is able +to calculate the limit to APIC ID values from designated +CPU topology. + +The value can be changed only until KVM_ENABLE_CAP is set to a nonzero +value or until a vCPU is created. Upon creation of the first vCPU, +if the value was set to zero or KVM_ENABLE_CAP was not invoked, KVM +uses the return value of KVM_CHECK_EXTENSION(KVM_CAP_MAX_VCPU_ID) as +the maximum APIC ID. + 8. Other capabilities.
====================== diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8109805b5429..9dc8d6d0a67d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1243,6 +1243,12 @@ struct kvm_arch { hpa_t hv_root_tdp; spinlock_t hv_root_tdp_lock; #endif + /* + * VM-scope maximum vCPU ID. Used to determine the size of structures + * that increase along with the maximum vCPU ID, in which case, using + * the global KVM_MAX_VCPU_IDS may lead to significant memory waste. + */ + u32 max_vcpu_ids; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9c2e2b6d1767..25fbb90c7c93 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6087,6 +6087,20 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_MAX_VCPU_ID: + r = -EINVAL; + if (cap->args[0] > KVM_MAX_VCPU_IDS) + break; + + mutex_lock(&kvm->lock); + if (kvm->arch.max_vcpu_ids == cap->args[0]) { + r = 0; + } else if (!kvm->arch.max_vcpu_ids) { + kvm->arch.max_vcpu_ids = cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; @@ -11246,6 +11260,12 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) pr_warn_once("kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); + if (!kvm->arch.max_vcpu_ids) + kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS; + + if (id >= kvm->arch.max_vcpu_ids) + return -EINVAL; + return 0; } -- cgit v1.2.3 From d588bb9be1da6aa750aa64875fe57369db983d8b Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Tue, 19 Apr 2022 23:45:10 +0800 Subject: KVM: VMX: enable IPI virtualization With IPI virtualization enabled, the processor emulates writes to APIC registers that would send IPIs. The processor sets the bit corresponding to the vector in the target vCPU's PIR and may send a notification (IPI) specified by the NDST and NV fields in the target vCPU's Posted-Interrupt Descriptor (PID). This is similar to what the IOMMU engine does when dealing with posted interrupts from devices. A PID-pointer table is used by the processor to locate the PID of a vCPU with the vCPU's APIC ID. The table size depends on the maximum APIC ID assigned for the current VM session from userspace. Allocating memory for the PID-pointer table is deferred to vCPU creation, because the irqchip mode and the VM-scope maximum APIC ID are settled at that point. KVM can skip PID-pointer table allocation if !irqchip_in_kernel(). Like VT-d PI, if a vCPU goes into the blocked state, the VMM needs to switch its notification vector to the wakeup vector. This ensures that when an IPI for a blocked vCPU arrives, the VMM gets control and can wake the vCPU up. And if a vCPU is preempted, its posted interrupt notification is suppressed. Note that IPI virtualization can only virtualize physical-addressing, flat-mode, unicast IPIs. Sending other IPIs still causes a trap-like APIC-write VM-exit that must be handled by the VMM.
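A standalone sketch of the table lookup the processor conceptually performs when it virtualizes such an IPI (an illustrative model only, not real hardware behavior or KVM code; the entry layout with bit 0 as the valid bit matches the PID_TABLE_ENTRY_VALID definition added in this patch):

  #include <stdint.h>
  #include <stdio.h>

  #define PID_TABLE_ENTRY_VALID 1

  int main(void)
  {
      uint64_t pid_table[4] = { 0 };     /* indexed by the target's APIC ID */
      uint64_t pi_desc_pa = 0x12345000;  /* pretend PID physical address */
      uint32_t dest = 2;

      pid_table[dest] = pi_desc_pa | PID_TABLE_ENTRY_VALID;

      if (pid_table[dest] & PID_TABLE_ENTRY_VALID)
          printf("set vector bit in PIR of PID at 0x%llx, notify NDST with NV\n",
                 (unsigned long long)(pid_table[dest] & ~1ull));
      else
          printf("no valid PID entry: take an APIC-write VM-exit instead\n");
      return 0;
  }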
Signed-off-by: Chao Gao Signed-off-by: Zeng Guang Message-Id: <20220419154510.11938-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/vmx.h | 8 ++++ arch/x86/include/asm/vmxfeatures.h | 2 + arch/x86/kvm/vmx/capabilities.h | 6 +++ arch/x86/kvm/vmx/posted_intr.c | 15 ++++++- arch/x86/kvm/vmx/posted_intr.h | 2 + arch/x86/kvm/vmx/vmx.c | 82 +++++++++++++++++++++++++++++++++++--- arch/x86/kvm/vmx/vmx.h | 7 ++++ arch/x86/kvm/x86.c | 2 +- 10 files changed, 119 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index da47f60a4650..6f2f1affbb78 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -21,6 +21,7 @@ KVM_X86_OP(has_emulated_msr) KVM_X86_OP(vcpu_after_set_cpuid) KVM_X86_OP(vm_init) KVM_X86_OP_OPTIONAL(vm_destroy) +KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate) KVM_X86_OP(vcpu_create) KVM_X86_OP(vcpu_free) KVM_X86_OP(vcpu_reset) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9dc8d6d0a67d..8118c52e3fec 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1347,6 +1347,7 @@ struct kvm_x86_ops { void (*vm_destroy)(struct kvm *kvm); /* Create, but do not attach this VCPU */ + int (*vcpu_precreate)(struct kvm *kvm); int (*vcpu_create)(struct kvm_vcpu *vcpu); void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 34ca428fefed..89d2172787c5 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -76,6 +76,11 @@ #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE) #define SECONDARY_EXEC_BUS_LOCK_DETECTION VMCS_CONTROL_BIT(BUS_LOCK_DETECTION) +/* + * Definitions of Tertiary Processor-Based VM-Execution Controls. 
+ */ +#define TERTIARY_EXEC_IPI_VIRT VMCS_CONTROL_BIT(IPI_VIRT) + #define PIN_BASED_EXT_INTR_MASK VMCS_CONTROL_BIT(INTR_EXITING) #define PIN_BASED_NMI_EXITING VMCS_CONTROL_BIT(NMI_EXITING) #define PIN_BASED_VIRTUAL_NMIS VMCS_CONTROL_BIT(VIRTUAL_NMIS) @@ -159,6 +164,7 @@ static inline int vmx_misc_mseg_revid(u64 vmx_misc) enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, POSTED_INTR_NV = 0x00000002, + LAST_PID_POINTER_INDEX = 0x00000008, GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, @@ -224,6 +230,8 @@ enum vmcs_field { TSC_MULTIPLIER_HIGH = 0x00002033, TERTIARY_VM_EXEC_CONTROL = 0x00002034, TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035, + PID_POINTER_TABLE = 0x00002042, + PID_POINTER_TABLE_HIGH = 0x00002043, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index ff20776dc83b..589608c157bf 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -86,4 +86,6 @@ #define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */ #define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */ +/* Tertiary Processor-Based VM-Execution Controls, word 3 */ +#define VMX_FEATURE_IPI_VIRT ( 3*32+ 4) /* Enable IPI virtualization */ #endif /* _ASM_X86_VMXFEATURES_H */ diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 31f3d88b3e4d..5f656c9e33be 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -13,6 +13,7 @@ extern bool __read_mostly enable_ept; extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; extern bool __read_mostly enable_pml; +extern bool __read_mostly enable_ipiv; extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 @@ -283,6 +284,11 @@ static inline bool cpu_has_vmx_apicv(void) cpu_has_vmx_posted_intr(); } +static inline bool cpu_has_vmx_ipiv(void) +{ + return vmcs_config.cpu_based_3rd_exec_ctrl & TERTIARY_EXEC_IPI_VIRT; +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 07e5fcf5a5aa..237a1f40f939 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -177,11 +177,24 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) local_irq_restore(flags); } +static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) +{ + /* + * The default posted interrupt vector does nothing when + * invoked outside guest mode. Return whether a blocked vCPU + * can be the target of posted interrupts, as is the case when + * using either IPI virtualization or VT-d PI, so that the + * notification vector is switched to the one that calls + * back to the pi_wakeup_handler() function. 
+ */ + return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm); +} + void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!vmx_can_use_vtd_pi(vcpu->kvm)) + if (!vmx_needs_pi_wakeup(vcpu)) return; if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 9a45d5c9f116..26992076552e 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -5,6 +5,8 @@ #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 +#define PID_TABLE_ENTRY_VALID 1 + /* Posted-Interrupt Descriptor */ struct pi_desc { u32 pir[8]; /* Posted interrupt requested */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3cbe3a38b356..f3175dae25aa 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -105,6 +105,9 @@ module_param(fasteoi, bool, S_IRUGO); module_param(enable_apicv, bool, S_IRUGO); +bool __read_mostly enable_ipiv = true; +module_param(enable_ipiv, bool, 0444); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -2527,7 +2530,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, } if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) { - u64 opt3 = 0; + u64 opt3 = TERTIARY_EXEC_IPI_VIRT; _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3, MSR_IA32_VMX_PROCBASED_CTLS3); @@ -3874,6 +3877,8 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); + if (enable_ipiv) + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); } } @@ -4202,14 +4207,19 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - if (kvm_vcpu_apicv_active(vcpu)) + if (kvm_vcpu_apicv_active(vcpu)) { secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - else + if (enable_ipiv) + tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); + } else { secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); + } vmx_update_msr_bitmap_x2apic(vcpu); } @@ -4242,7 +4252,16 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) { - return vmcs_config.cpu_based_3rd_exec_ctrl; + u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; + + /* + * IPI virtualization relies on APICv. Disable IPI virtualization if + * APICv is inhibited. 
+ */ + if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) + exec_control &= ~TERTIARY_EXEC_IPI_VIRT; + + return exec_control; } /* @@ -4390,10 +4409,42 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) return exec_control; } +static inline int vmx_get_pid_table_order(struct kvm *kvm) +{ + return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); +} + +static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) +{ + struct page *pages; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + if (!irqchip_in_kernel(kvm) || !enable_ipiv) + return 0; + + if (kvm_vmx->pid_table) + return 0; + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm)); + if (!pages) + return -ENOMEM; + + kvm_vmx->pid_table = (void *)page_address(pages); + return 0; +} + +static int vmx_vcpu_precreate(struct kvm *kvm) +{ + return vmx_alloc_ipiv_pid_table(kvm); +} + #define VMX_XSS_EXIT_BITMAP 0 static void init_vmcs(struct vcpu_vmx *vmx) { + struct kvm *kvm = vmx->vcpu.kvm; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + if (nested) nested_vmx_set_vmcs_shadowing_bitmap(); @@ -4425,7 +4476,12 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } - if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { + if (vmx_can_use_ipiv(&vmx->vcpu)) { + vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); + vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); + } + + if (!kvm_pause_in_guest(kvm)) { vmcs_write32(PLE_GAP, ple_gap); vmx->ple_window = ple_window; vmx->ple_window_dirty = true; @@ -7116,6 +7172,10 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) goto free_vmcs; } + if (vmx_can_use_ipiv(vcpu)) + WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], + __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + return 0; free_vmcs: @@ -7750,6 +7810,13 @@ static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) return supported & BIT(reason); } +static void vmx_vm_destroy(struct kvm *kvm) +{ + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); +} + static struct kvm_x86_ops vmx_x86_ops __initdata = { .name = "kvm_intel", @@ -7761,7 +7828,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vm_size = sizeof(struct kvm_vmx), .vm_init = vmx_vm_init, + .vm_destroy = vmx_vm_destroy, + .vcpu_precreate = vmx_vcpu_precreate, .vcpu_create = vmx_vcpu_create, .vcpu_free = vmx_vcpu_free, .vcpu_reset = vmx_vcpu_reset, @@ -8039,6 +8108,9 @@ static __init int hardware_setup(void) if (!enable_apicv) vmx_x86_ops.sync_pir_to_irr = NULL; + if (!enable_apicv || !cpu_has_vmx_ipiv()) + enable_ipiv = false; + if (cpu_has_vmx_tsc_scaling()) kvm_has_tsc_control = true; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index c37befcea2c0..d7baedda79e5 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -366,6 +366,8 @@ struct kvm_vmx { unsigned int tss_addr; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; + /* Posted Interrupt Descriptor (PID) table for IPI virtualization */ + u64 *pid_table; }; bool nested_vmx_allowed(struct kvm_vcpu *vcpu); @@ -581,4 +583,9 @@ static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info) return (vmx_instr_info >> 28) & 0xf; } +static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) +{ + return lapic_in_kernel(vcpu) && enable_ipiv; +} + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 25fbb90c7c93..37fb301f52af 100644 --- 
a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11266,7 +11266,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) if (id >= kvm->arch.max_vcpu_ids) return -EINVAL; - return 0; + return static_call(kvm_x86_vcpu_precreate)(kvm); } int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From fb358e0b811eec233f6db86d591b3af99d23c8e3 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:30 +0800 Subject: perf/x86/intel: Add EPT-Friendly PEBS for Ice Lake Server Add support for EPT-Friendly PEBS, a new CPU feature that enlightens PEBS to translate guest linear address through EPT, and facilitates handling VM-Exits that occur when accessing PEBS records. More information can be found in the December 2021 release of Intel's SDM, Volume 3, 18.9.5 "EPT-Friendly PEBS". This new hardware facility makes sure the guest PEBS records will not be lost, which is available on Intel Ice Lake Server platforms (and later). KVM will check this field through perf_get_x86_pmu_capability() instead of hard coding the CPU models in the KVM code. If it is supported, the guest PEBS capability will be exposed to the guest. Guest PEBS can be enabled when and only when "EPT-Friendly PEBS" is supported and EPT is enabled. Cc: linux-perf-users@vger.kernel.org Signed-off-by: Like Xu Message-Id: <20220411101946.20262-2-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/events/core.c | 1 + arch/x86/events/intel/core.c | 1 + arch/x86/events/perf_event.h | 3 ++- arch/x86/include/asm/perf_event.h | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 30788894124f..a9ebd096dfb4 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -3002,5 +3002,6 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) cap->bit_width_fixed = x86_pmu.cntval_bits; cap->events_mask = (unsigned int)x86_pmu.events_maskl; cap->events_mask_len = x86_pmu.events_mask_len; + cap->pebs_ept = x86_pmu.pebs_ept; } EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 45024abd929f..96bb0ac7462b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6138,6 +6138,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ICELAKE_X: case INTEL_FAM6_ICELAKE_D: + x86_pmu.pebs_ept = 1; pmem = true; fallthrough; case INTEL_FAM6_ICELAKE_L: diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 21a5482bcf84..4910dc41433b 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -818,7 +818,8 @@ struct x86_pmu { pebs_prec_dist :1, pebs_no_tlb :1, pebs_no_isolation :1, - pebs_block :1; + pebs_block :1, + pebs_ept :1; int pebs_record_size; int pebs_buffer_size; int max_pebs_events; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 409725e86f42..f95ab4da6fea 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -206,6 +206,7 @@ struct x86_pmu_capability { int bit_width_fixed; unsigned int events_mask; int events_mask_len; + unsigned int pebs_ept :1; }; /* -- cgit v1.2.3 From 69e575dd4fba51dca9f25db7b2033d730699e7ff Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:31 +0800 Subject: perf/x86/intel: Handle guest PEBS overflow PMI for KVM guest With PEBS virtualization, the guest PEBS records get delivered to the guest DS, and the host pmi handler uses perf_guest_cbs->is_in_guest() to 
distinguish whether the PMI comes from the guest code like Intel PT. No matter how many guest PEBS counters are overflowed, only triggering one fake event is enough. The fake event causes the KVM PMI callback to be called, thereby injecting the PEBS overflow PMI into the guest. KVM may inject the PMI with BUFFER_OVF set, even if the guest DS is empty. That should really be harmless. Thus guest PEBS handler would retrieve the correct information from its own PEBS records buffer. Cc: linux-perf-users@vger.kernel.org Originally-by: Andi Kleen Co-developed-by: Kan Liang Signed-off-by: Kan Liang Signed-off-by: Like Xu Message-Id: <20220411101946.20262-3-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/events/intel/core.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 96bb0ac7462b..8e5036f32e84 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2852,6 +2852,47 @@ static void intel_pmu_reset(void) local_irq_restore(flags); } +/* + * We may be running with guest PEBS events created by KVM, and the + * PEBS records are logged into the guest's DS and invisible to host. + * + * In the case of guest PEBS overflow, we only trigger a fake event + * to emulate the PEBS overflow PMI for guest PEBS counters in KVM. + * The guest will then vm-entry and check the guest DS area to read + * the guest PEBS records. + * + * The contents and other behavior of the guest event do not matter. + */ +static void x86_pmu_handle_guest_pebs(struct pt_regs *regs, + struct perf_sample_data *data) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 guest_pebs_idxs = cpuc->pebs_enabled & ~cpuc->intel_ctrl_host_mask; + struct perf_event *event = NULL; + int bit; + + if (!unlikely(perf_guest_state())) + return; + + if (!x86_pmu.pebs_ept || !x86_pmu.pebs_active || + !guest_pebs_idxs) + return; + + for_each_set_bit(bit, (unsigned long *)&guest_pebs_idxs, + INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed) { + event = cpuc->events[bit]; + if (!event->attr.precise_ip) + continue; + + perf_sample_data_init(data, 0, event->hw.last_period); + if (perf_event_overflow(event, data, regs)) + x86_pmu_stop(event, 0); + + /* Inject one fake event is enough. */ + break; + } +} + static int handle_pmi_common(struct pt_regs *regs, u64 status) { struct perf_sample_data data; @@ -2903,6 +2944,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) u64 pebs_enabled = cpuc->pebs_enabled; handled++; + x86_pmu_handle_guest_pebs(regs, &data); x86_pmu.drain_pebs(regs, &data); status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; -- cgit v1.2.3 From 39a4d779546a993c53cea28e659e8edc9f868af0 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:32 +0800 Subject: perf/x86/core: Pass "struct kvm_pmu *" to determine the guest values Splitting the logic for determining the guest values is unnecessarily confusing, and potentially fragile. Perf should have full knowledge and control of what values are loaded for the guest. If we change .guest_get_msrs() to take a struct kvm_pmu pointer, then it can generate the full set of guest values by grabbing guest ds_area and pebs_data_cfg. Alternatively, .guest_get_msrs() could take the desired guest MSR values directly (ds_area and pebs_data_cfg), but kvm_pmu is vendor agnostic, so we don't see any reason to not just pass the pointer. 
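The void pointer keeps the perf core ignorant of KVM types while letting the vendor callback recover the context. A standalone sketch of the pattern (all names here are illustrative stand-ins, not the kernel's; 0x38f and 0x600 are the architectural addresses of PERF_GLOBAL_CTRL and IA32_DS_AREA):

  #include <stdint.h>
  #include <stdio.h>

  struct guest_pmu { uint64_t ds_area; };   /* stand-in for struct kvm_pmu */
  struct switch_msr { uint32_t msr; uint64_t host, guest; };

  static struct switch_msr *vendor_get_msrs(int *nr, void *data)
  {
      static struct switch_msr arr[2];
      struct guest_pmu *pmu = data;         /* may be NULL: no guest context */

      *nr = 0;
      arr[(*nr)++] = (struct switch_msr){ .msr = 0x38f /* GLOBAL_CTRL */ };
      if (pmu)                              /* guest value read straight from the context */
          arr[(*nr)++] = (struct switch_msr){
              .msr = 0x600 /* IA32_DS_AREA */, .guest = pmu->ds_area,
          };
      return arr;
  }

  int main(void)
  {
      struct guest_pmu pmu = { .ds_area = 0x7f0000 };
      int nr;
      struct switch_msr *msrs = vendor_get_msrs(&nr, &pmu);

      printf("switching %d MSRs, last guest value 0x%llx\n", nr,
             (unsigned long long)msrs[nr - 1].guest);
      return 0;
  }

Later patches in this series use the hook in exactly this way, handing the guest's ds_area and pebs_data_cfg to intel_guest_get_msrs().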
Suggested-by: Sean Christopherson Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-4-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/events/core.c | 4 ++-- arch/x86/events/intel/core.c | 4 ++-- arch/x86/events/perf_event.h | 2 +- arch/x86/include/asm/perf_event.h | 4 ++-- arch/x86/kvm/vmx/vmx.c | 3 ++- 5 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index a9ebd096dfb4..330825160b9a 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -693,9 +693,9 @@ void x86_pmu_disable_all(void) } } -struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data) { - return static_call(x86_pmu_guest_get_msrs)(nr); + return static_call(x86_pmu_guest_get_msrs)(nr, data); } EXPORT_SYMBOL_GPL(perf_guest_get_msrs); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 8e5036f32e84..b56ead52790a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3972,7 +3972,7 @@ static int intel_pmu_hw_config(struct perf_event *event) return 0; } -static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) +static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; @@ -4005,7 +4005,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) return arr; } -static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr) +static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr, void *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 4910dc41433b..07fdef4f9ad2 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -903,7 +903,7 @@ struct x86_pmu { /* * Intel host/guest support (KVM) */ - struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); + struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr, void *data); /* * Check period value for PERF_EVENT_IOC_PERIOD ioctl. diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index f95ab4da6fea..58e2fcbb8bcc 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -519,10 +519,10 @@ static inline void perf_check_microcode(void) { } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) -extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); +extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr); #else -struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) { return -1; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f3175dae25aa..070b02162db6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6794,9 +6794,10 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) { int i, nr_msrs; struct perf_guest_switch_msr *msrs; + struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. 
*/ - msrs = perf_guest_get_msrs(&nr_msrs); + msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); if (!msrs) return; -- cgit v1.2.3 From bef6ecca46ac938ffb352d7fa2f6eafd1b6a41be Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:33 +0800 Subject: KVM: x86/pmu: Set MSR_IA32_MISC_ENABLE_EMON bit when vPMU is enabled On Intel platforms, software can use the IA32_MISC_ENABLE[7] bit to detect whether the processor supports the performance monitoring facility. The bit's value depends on whether the PMU is enabled for the guest, and a software write to this availability bit is ignored. Ignoring the toggle in KVM is the way to go, and that behavior matches bare metal. Signed-off-by: Like Xu Message-Id: <20220411101946.20262-5-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 1 + arch/x86/kvm/x86.c | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 37e9eb32e3d9..b7dd24476b52 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -498,6 +498,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (!pmu->version) return; + vcpu->arch.ia32_misc_enable_msr |= MSR_IA32_MISC_ENABLE_EMON; perf_get_x86_pmu_capability(&x86_pmu); pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 37fb301f52af..68ec5cbeb665 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3558,9 +3558,19 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_tsc_adjust_msr = data; } break; - case MSR_IA32_MISC_ENABLE: + case MSR_IA32_MISC_ENABLE: { + u64 old_val = vcpu->arch.ia32_misc_enable_msr; + u64 pmu_mask = MSR_IA32_MISC_ENABLE_EMON; + + /* + * For a dummy user space, the order of setting vPMU capabilities and + * initialising MSR_IA32_MISC_ENABLE is not strictly guaranteed, so to + * avoid inconsistent functionality we keep the vPMU bits unchanged here. + */ + data &= ~pmu_mask; + data |= old_val & pmu_mask; if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && - ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { + ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; @@ -3569,6 +3579,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_misc_enable_msr = data; } break; + } case MSR_IA32_SMBASE: if (!msr_info->host_initiated) return 1; -- cgit v1.2.3 From 2c985527dd8d283e786ad7a67e532ef7f6f00fac Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:34 +0800 Subject: KVM: x86/pmu: Introduce the ctrl_mask value for fixed counter The mask value of the fixed counter control register should be dynamically adjusted according to the number of fixed counters. This patch introduces a variable that holds the reserved bits of the fixed counter control registers. This is generic code refactoring.
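A worked example of the mask construction (standalone; it mirrors the loop added in the diff below and shows that, for a CPU with three fixed counters, it reproduces the old hard-coded 0xfffffffffffff444 literal):

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      uint64_t fixed_ctr_ctrl_mask = ~0ull;
      int nr_fixed = 3;    /* e.g. three fixed counters enumerated */

      /*
       * Each fixed counter owns a 4-bit control field; clear its valid
       * bits (OS=bit 0, USR=bit 1, PMI=bit 3, i.e. 0xb) so that only
       * reserved bits, including AnyThread (bit 2), remain set.
       */
      for (int i = 0; i < nr_fixed; i++)
          fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));

      printf("mask=0x%llx\n", (unsigned long long)fixed_ctr_ctrl_mask);
      return 0;
  }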
Co-developed-by: Luwei Kang Signed-off-by: Luwei Kang Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-6-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/pmu_intel.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8118c52e3fec..7458abe81503 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -505,6 +505,7 @@ struct kvm_pmu { unsigned nr_arch_fixed_counters; unsigned available_event_types; u64 fixed_ctr_ctrl; + u64 fixed_ctr_ctrl_mask; u64 global_ctrl; u64 global_status; u64 counter_bitmask[2]; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index b7dd24476b52..c04d1235316d 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -395,7 +395,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_FIXED_CTR_CTRL: if (pmu->fixed_ctr_ctrl == data) return 0; - if (!(data & 0xfffffffffffff444ull)) { + if (!(data & pmu->fixed_ctr_ctrl_mask)) { reprogram_fixed_counters(pmu, data); return 0; } @@ -479,6 +479,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; + int i; pmu->nr_arch_gp_counters = 0; pmu->nr_arch_fixed_counters = 0; @@ -487,6 +488,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->version = 0; pmu->reserved_bits = 0xffffffff00200000ull; pmu->raw_event_mask = X86_RAW_EVENT_MASK; + pmu->fixed_ctr_ctrl_mask = ~0ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry || !vcpu->kvm->arch.enable_pmu) @@ -523,6 +525,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) setup_fixed_pmc_eventsel(pmu); } + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) + pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; -- cgit v1.2.3 From 0d23dc34a7cefde5ee25c321949579694edbd16d Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Mon, 11 Apr 2022 18:19:35 +0800 Subject: x86/perf/core: Add pebs_capable to store valid PEBS_COUNTER_MASK value The value of pebs_counter_mask will be accessed frequently for repeated use in the intel_guest_get_msrs(). So it can be optimized instead of endlessly mucking about with branches. Signed-off-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-7-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/events/intel/core.c | 14 ++++++-------- arch/x86/events/perf_event.h | 1 + 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index b56ead52790a..8c9cb41fc20a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2932,10 +2932,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) * counters from the GLOBAL_STATUS mask and we always process PEBS * events via drain_pebs(). 
*/ - if (x86_pmu.flags & PMU_FL_PEBS_ALL) - status &= ~cpuc->pebs_enabled; - else - status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); + status &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable); /* * PEBS overflow sets bit 62 in the global status register @@ -3981,10 +3978,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; arr[0].host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask; arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask; - if (x86_pmu.flags & PMU_FL_PEBS_ALL) - arr[0].guest &= ~cpuc->pebs_enabled; - else - arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); + arr[0].guest &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable); *nr = 1; if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) { @@ -5692,6 +5686,7 @@ __init int intel_pmu_init(void) x86_pmu.events_mask_len = eax.split.mask_length; x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + x86_pmu.pebs_capable = PEBS_COUNTER_MASK; /* * Quirk: v2 perfmon does not report fixed-purpose events, so @@ -5876,6 +5871,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.lbr_pt_coexist = true; + x86_pmu.pebs_capable = ~0ULL; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_PEBS_ALL; x86_pmu.get_event_constraints = glp_get_event_constraints; @@ -6233,6 +6229,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.pebs_block = true; + x86_pmu.pebs_capable = ~0ULL; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.flags |= PMU_FL_PEBS_ALL; @@ -6278,6 +6275,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.pebs_block = true; + x86_pmu.pebs_capable = ~0ULL; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.flags |= PMU_FL_PEBS_ALL; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 07fdef4f9ad2..09c68265b577 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -828,6 +828,7 @@ struct x86_pmu { void (*pebs_aliases)(struct perf_event *event); unsigned long large_pebs_flags; u64 rtm_abort_event; + u64 pebs_capable; /* * Intel LBR -- cgit v1.2.3 From c59a1f106f5cd4843c097069ff1bb2ad72103a67 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:36 +0800 Subject: KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS If IA32_PERF_CAPABILITIES.PEBS_BASELINE [bit 14] is set, the IA32_PEBS_ENABLE MSR exists and all architecturally enumerated fixed and general-purpose counters have corresponding bits in IA32_PEBS_ENABLE that enable generation of PEBS records. The general-purpose counter bits start at bit IA32_PEBS_ENABLE[0], and the fixed counter bits start at bit IA32_PEBS_ENABLE[32]. When guest PEBS is enabled, the IA32_PEBS_ENABLE MSR will be added to the perf_guest_switch_msr() and atomically switched during the VMX transitions just like CORE_PERF_GLOBAL_CTRL MSR. Based on whether the platform supports x86_pmu.pebs_ept, it has also refactored the way to add more msrs to arr[] in intel_guest_get_msrs() for extensibility. 
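A sketch of the reserved-bit check this enables (standalone; the mask formula follows the non-baseline branch of intel_pmu_refresh() in this patch, where only the general-purpose counter bits starting at bit 0 are writable):

  #include <stdint.h>
  #include <stdio.h>

  /* Accept a guest WRMSR to IA32_PEBS_ENABLE only if no reserved bit is set. */
  static int pebs_enable_wrmsr_ok(uint64_t data, int nr_arch_gp_counters)
  {
      uint64_t pebs_enable_mask = ~((1ull << nr_arch_gp_counters) - 1);

      return !(data & pebs_enable_mask);
  }

  int main(void)
  {
      printf("%d\n", pebs_enable_wrmsr_ok(0x3, 4));         /* 1: GP counters 0-1 */
      printf("%d\n", pebs_enable_wrmsr_ok(1ull << 32, 4));  /* 0: fixed-counter bit */
      return 0;
  }

With PEBS baseline exposed, the fixed-counter bits at position 32 and up become writable as well, so the mask is derived from global_ctrl instead.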
Originally-by: Andi Kleen Co-developed-by: Kan Liang Signed-off-by: Kan Liang Co-developed-by: Luwei Kang Signed-off-by: Luwei Kang Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-8-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/events/intel/core.c | 75 ++++++++++++++++++++++++++++++---------- arch/x86/include/asm/kvm_host.h | 3 ++ arch/x86/include/asm/msr-index.h | 6 ++++ arch/x86/kvm/vmx/pmu_intel.c | 31 +++++++++++++++++ arch/x86/kvm/x86.c | 1 + 5 files changed, 98 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 8c9cb41fc20a..70a5c66789df 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3969,33 +3969,72 @@ static int intel_pmu_hw_config(struct perf_event *event) return 0; } +/* + * Currently, the only caller of this function is the atomic_switch_perf_msrs(). + * The host perf conext helps to prepare the values of the real hardware for + * a set of msrs that need to be switched atomically in a vmx transaction. + * + * For example, the pseudocode needed to add a new msr should look like: + * + * arr[(*nr)++] = (struct perf_guest_switch_msr){ + * .msr = the hardware msr address, + * .host = the value the hardware has when it doesn't run a guest, + * .guest = the value the hardware has when it runs a guest, + * }; + * + * These values have nothing to do with the emulated values the guest sees + * when it uses {RD,WR}MSR, which should be handled by the KVM context, + * specifically in the intel_pmu_{get,set}_msr(). + */ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); + u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable; + int global_ctrl, pebs_enable; + + *nr = 0; + global_ctrl = (*nr)++; + arr[global_ctrl] = (struct perf_guest_switch_msr){ + .msr = MSR_CORE_PERF_GLOBAL_CTRL, + .host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask, + .guest = intel_ctrl & (~cpuc->intel_ctrl_host_mask | ~pebs_mask), + }; - arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; - arr[0].host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask; - arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask; - arr[0].guest &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable); - *nr = 1; + if (!x86_pmu.pebs) + return arr; - if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) { - /* - * If PMU counter has PEBS enabled it is not enough to - * disable counter on a guest entry since PEBS memory - * write can overshoot guest entry and corrupt guest - * memory. Disabling PEBS solves the problem. - * - * Don't do this if the CPU already enforces it. - */ - arr[1].msr = MSR_IA32_PEBS_ENABLE; - arr[1].host = cpuc->pebs_enabled; - arr[1].guest = 0; - *nr = 2; + /* + * If PMU counter has PEBS enabled it is not enough to + * disable counter on a guest entry since PEBS memory + * write can overshoot guest entry and corrupt guest + * memory. Disabling PEBS solves the problem. + * + * Don't do this if the CPU already enforces it. 
+ */ + if (x86_pmu.pebs_no_isolation) { + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_PEBS_ENABLE, + .host = cpuc->pebs_enabled, + .guest = 0, + }; + return arr; } + if (!x86_pmu.pebs_ept) + return arr; + pebs_enable = (*nr)++; + + arr[pebs_enable] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_PEBS_ENABLE, + .host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask, + .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask, + }; + + /* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */ + arr[0].guest |= arr[*nr].guest; + return arr; } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7458abe81503..36a5650b9007 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -521,6 +521,9 @@ struct kvm_pmu { DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); + u64 pebs_enable; + u64 pebs_enable_mask; + /* * The gate to release perf_events not marked in * pmc_in_use only once in a vcpu time slice. diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index c194995b2e1f..bd1861c9cfa8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -196,6 +196,12 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index c04d1235316d..2cd4f8a751be 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -214,6 +214,9 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: ret = pmu->version > 1; break; + case MSR_IA32_PEBS_ENABLE: + ret = vcpu->arch.perf_capabilities & PERF_CAP_PEBS_FORMAT; + break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || @@ -361,6 +364,9 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = 0; return 0; + case MSR_IA32_PEBS_ENABLE: + msr_info->data = pmu->pebs_enable; + return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -421,6 +427,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } break; + case MSR_IA32_PEBS_ENABLE: + if (pmu->pebs_enable == data) + return 0; + if (!(data & pmu->pebs_enable_mask)) { + pmu->pebs_enable = data; + return 0; + } + break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -489,6 +503,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->reserved_bits = 0xffffffff00200000ull; pmu->raw_event_mask = X86_RAW_EVENT_MASK; pmu->fixed_ctr_ctrl_mask = ~0ull; + pmu->pebs_enable_mask = ~0ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry || !vcpu->kvm->arch.enable_pmu) @@ -560,6 +575,22 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (lbr_desc->records.nr) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); + + if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_FORMAT) { + if (vcpu->arch.perf_capabilities & 
PERF_CAP_PEBS_BASELINE) { + pmu->pebs_enable_mask = ~pmu->global_ctrl; + pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmu->fixed_ctr_ctrl_mask &= + ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4)); + } + } else { + pmu->pebs_enable_mask = + ~((1ull << pmu->nr_arch_gp_counters) - 1); + } + } else { + vcpu->arch.perf_capabilities &= ~PERF_CAP_PEBS_MASK; + } } static void intel_pmu_init(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 68ec5cbeb665..12183c790ed1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1448,6 +1448,7 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + MSR_IA32_PEBS_ENABLE, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, -- cgit v1.2.3 From 79f3e3b58386a2fc05054367b905619f741beeb4 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:37 +0800 Subject: KVM: x86/pmu: Reprogram PEBS event to emulate guest PEBS counter When a guest counter is configured as a PEBS counter through IA32_PEBS_ENABLE, a guest PEBS event will be reprogrammed by configuring a non-zero precision level in the perf_event_attr. The guest PEBS overflow PMI bit would be set in the guest GLOBAL_STATUS MSR when PEBS facility generates a PEBS overflow PMI based on guest IA32_DS_AREA MSR. Even with the same counter index and the same event code and mask, guest PEBS events will not be reused for non-PEBS events. Originally-by: Andi Kleen Co-developed-by: Kan Liang Signed-off-by: Kan Liang Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-9-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3f868fed9114..cdefcb091ac8 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -86,15 +86,22 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work) static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); + bool skip_pmi = false; /* Ignore counters that have been reprogrammed already. */ if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) return; - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + if (pmc->perf_event && pmc->perf_event->attr.precise_ip) { + /* Indicate PEBS overflow PMI to guest. 
*/ + skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, + (unsigned long *)&pmu->global_status); + } else { + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + } kvm_make_request(KVM_REQ_PMU, pmc->vcpu); - if (!pmc->intr) + if (!pmc->intr || skip_pmi) return; /* @@ -124,6 +131,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, bool exclude_user, bool exclude_kernel, bool intr) { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); struct perf_event *event; struct perf_event_attr attr = { .type = type, @@ -135,6 +143,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, .exclude_kernel = exclude_kernel, .config = config, }; + bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable); if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX) return; @@ -150,6 +159,23 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, */ attr.sample_period = 0; } + if (pebs) { + /* + * The non-zero precision level of a guest event makes the ordinary + * guest event become a guest PEBS event and triggers the host + * PEBS PMI handler to determine whether the PEBS overflow PMI + * comes from the host counters or the guest. + * + * For most PEBS hardware events, the difference in the software + * precision levels of guest and host PEBS events will not affect + * the accuracy of the PEBS profiling result, because the "event IP" + * in the PEBS record is calibrated on the guest side. + * + * On Icelake everything is fine. Other hardware (GLC+, TNT+) that + * could possibly care here is unsupported and needs changes. + */ + attr.precise_ip = 1; + } event = perf_event_create_kernel_counter(&attr, -1, current, kvm_perf_overflow, pmc); @@ -163,7 +189,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, pmc_to_pmu(pmc)->event_count++; clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); pmc->is_paused = false; - pmc->intr = intr; + pmc->intr = intr || pebs; } static void pmc_pause_counter(struct kvm_pmc *pmc) @@ -189,6 +215,10 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) get_sample_period(pmc, pmc->counter))) return false; + if (!test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) && + pmc->perf_event->attr.precise_ip) + return false; + /* reuse perf_event to serve as pmc_reprogram_counter() does */ perf_event_enable(pmc->perf_event); pmc->is_paused = false; -- cgit v1.2.3 From 6ebe44366bdeaf3059f2b644bbd99824ae824228 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:38 +0800 Subject: KVM: x86/pmu: Adjust precise_ip to emulate Ice Lake guest PDIR counter The PEBS-PDIR facility on Ice Lake server is supported on IA32_FIXED_CTR0 (counter index 32) only. If the guest configures counter 32 and PEBS is enabled, the PEBS-PDIR facility is supposed to be used, in which case KVM adjusts attr.precise_ip to 3 and requests host perf to assign exactly the requested counter or fail. The CPU model check is also required since some platforms may place the PEBS-PDIR facility in another counter index. 
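For illustration, the perf API contract relied on here looks roughly as follows (a minimal sketch, not code from this series; the helper name and event selection are placeholders):

/*
 * Sketch: with precise_ip == 3, host perf must either grant the one
 * counter that implements PDIR (the first fixed counter, index 32,
 * on Ice Lake server) or fail the event creation outright.
 */
static struct perf_event *pdir_event_sketch(void)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.size		= sizeof(attr),
		.config		= PERF_COUNT_HW_CPU_CYCLES,	/* placeholder event */
		.precise_ip	= 3,				/* request PDIR precision */
		.exclude_host	= 1,				/* count only in guest mode */
	};

	return perf_event_create_kernel_counter(&attr, -1, current, NULL, NULL);
}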
Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-10-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/events/intel/core.c | 2 +- arch/x86/kvm/pmu.c | 2 ++ arch/x86/kvm/pmu.h | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 70a5c66789df..7ef7fd4ab29b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4024,8 +4024,8 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) if (!x86_pmu.pebs_ept) return arr; - pebs_enable = (*nr)++; + pebs_enable = (*nr)++; arr[pebs_enable] = (struct perf_guest_switch_msr){ .msr = MSR_IA32_PEBS_ENABLE, .host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask, diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index cdefcb091ac8..162dfe23c071 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -175,6 +175,8 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, * could possibly care here is unsupported and needs changes. */ attr.precise_ip = 1; + if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32) + attr.precise_ip = 3; } event = perf_event_create_kernel_counter(&attr, -1, current, diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index e745f443b6a8..5966ce18a82b 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -4,6 +4,8 @@ #include <linux/nospec.h> +#include <asm/cpu_device_id.h> + #define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu) #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) @@ -15,6 +17,12 @@ #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 #define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002 +static const struct x86_cpu_id vmx_icl_pebs_cpu[] = { + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL), + {} +}; + struct kvm_event_hw_type_mapping { u8 eventsel; u8 unit_mask; -- cgit v1.2.3 From 8183a538cd95f72f11871b35726256ec3bcb9439 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:39 +0800 Subject: KVM: x86/pmu: Add IA32_DS_AREA MSR emulation to support guest DS When CPUID.01H:EDX.DS[21] is set, the IA32_DS_AREA MSR exists and points to the linear address of the first byte of the DS buffer management area, which is used to manage the PEBS records. When guest PEBS is enabled, the MSR_IA32_DS_AREA MSR will be added to the perf_guest_switch_msr() list and switched during the VMX transitions just like the CORE_PERF_GLOBAL_CTRL MSR. A WRMSR to the IA32_DS_AREA MSR raises a #GP(0) if the source register contains a non-canonical address. 
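For reference, "canonical" on a 48-bit implementation means bits 63:47 are all copies of bit 47; a stand-alone sketch of such a check, using the same sign-extension idiom KVM uses but with the address width hard-coded to 48 bits (KVM's is_noncanonical_address() instead derives the width from the vCPU state, so LA57 is ignored here):

/* Sketch: canonical = properly sign-extended from bit 47. */
static inline bool is_canonical_48(u64 va)
{
	return (u64)(((s64)va << 16) >> 16) == va;
}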
Originally-by: Andi Kleen Co-developed-by: Kan Liang Signed-off-by: Kan Liang Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-11-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/events/intel/core.c | 10 +++++++++- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/pmu_intel.c | 11 +++++++++++ arch/x86/kvm/x86.c | 2 +- 4 files changed, 22 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7ef7fd4ab29b..e5e624cae957 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/export.h> #include <linux/nmi.h> +#include <linux/kvm_host.h> #include <asm/cpufeature.h> #include <asm/hardirq.h> @@ -3990,6 +3991,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + struct kvm_pmu *kvm_pmu = (struct kvm_pmu *)data; u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable; int global_ctrl, pebs_enable; @@ -4022,9 +4024,15 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) return arr; } - if (!x86_pmu.pebs_ept) + if (!kvm_pmu || !x86_pmu.pebs_ept) return arr; + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_DS_AREA, + .host = (unsigned long)cpuc->ds, + .guest = kvm_pmu->ds_area, + }; + pebs_enable = (*nr)++; arr[pebs_enable] = (struct perf_guest_switch_msr){ .msr = MSR_IA32_PEBS_ENABLE, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 36a5650b9007..dc5f68a313b0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -521,6 +521,7 @@ struct kvm_pmu { DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); + u64 ds_area; u64 pebs_enable; u64 pebs_enable_mask; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 2cd4f8a751be..36ba29b664bf 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -217,6 +217,9 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_IA32_PEBS_ENABLE: ret = vcpu->arch.perf_capabilities & PERF_CAP_PEBS_FORMAT; break; + case MSR_IA32_DS_AREA: + ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); + break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || @@ -367,6 +370,9 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_PEBS_ENABLE: msr_info->data = pmu->pebs_enable; return 0; + case MSR_IA32_DS_AREA: + msr_info->data = pmu->ds_area; + return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -435,6 +441,11 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } break; + case MSR_IA32_DS_AREA: + if (is_noncanonical_address(data, vcpu)) + return 1; + pmu->ds_area = data; + return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 12183c790ed1..ead86072612d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1448,7 +1448,7 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, 
MSR_ARCH_PERFMON_EVENTSEL0 + 17, - MSR_IA32_PEBS_ENABLE, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, -- cgit v1.2.3 From 902caeb6841a64072791b1c18f9f56089566865d Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:40 +0800 Subject: KVM: x86/pmu: Add PEBS_DATA_CFG MSR emulation to support adaptive PEBS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If IA32_PERF_CAPABILITIES.PEBS_BASELINE [bit 14] is set, adaptive PEBS is supported. The PEBS_DATA_CFG MSR and adaptive record enable bits (IA32_PERFEVTSELx.Adaptive_Record and IA32_FIXED_CTR_CTRL.FCx_Adaptive_Record) are also supported. Adaptive PEBS provides software the capability to configure the PEBS records to capture only the data of interest, keeping the record size compact. An overflow of PMCx results in generation of an adaptive PEBS record with state information based on the selections specified in MSR_PEBS_DATA_CFG. By default, the record contains only the Basic group. When guest adaptive PEBS is enabled, the PEBS_DATA_CFG MSR will be added to the perf_guest_switch_msr() list and switched during the VMX transitions just like the CORE_PERF_GLOBAL_CTRL MSR. According to the Intel SDM, software is recommended to use PEBS Baseline when the following is true: IA32_PERF_CAPABILITIES.PEBS_BASELINE[14] && IA32_PERF_CAPABILITIES.PEBS_FMT[11:8] ≥ 4. Co-developed-by: Luwei Kang Signed-off-by: Luwei Kang Signed-off-by: Like Xu Message-Id: <20220411101946.20262-12-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/events/intel/core.c | 8 ++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/vmx/pmu_intel.c | 20 +++++++++++++++++++- arch/x86/kvm/x86.c | 2 +- 4 files changed, 30 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index e5e624cae957..8f6189f2fbf3 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4033,6 +4033,14 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) .guest = kvm_pmu->ds_area, }; + if (x86_pmu.intel_cap.pebs_baseline) { + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_PEBS_DATA_CFG, + .host = cpuc->pebs_data_cfg, + .guest = kvm_pmu->pebs_data_cfg, + }; + } + pebs_enable = (*nr)++; arr[pebs_enable] = (struct perf_guest_switch_msr){ .msr = MSR_IA32_PEBS_ENABLE, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dc5f68a313b0..d99c130d0a13 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -524,6 +524,8 @@ struct kvm_pmu { u64 ds_area; u64 pebs_enable; u64 pebs_enable_mask; + u64 pebs_data_cfg; + u64 pebs_data_cfg_mask; /* * The gate to release perf_events not marked in diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 36ba29b664bf..69eb5372c922 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -205,6 +205,7 @@ static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 perf_capabilities = vcpu->arch.perf_capabilities; int ret; switch (msr) { @@ -215,11 +216,15 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) ret = pmu->version > 1; break; case MSR_IA32_PEBS_ENABLE: - ret = vcpu->arch.perf_capabilities & 
PERF_CAP_PEBS_FORMAT; + ret = perf_capabilities & PERF_CAP_PEBS_FORMAT; break; case MSR_IA32_DS_AREA: ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); break; + case MSR_PEBS_DATA_CFG: + ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) && + ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3); + break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || @@ -373,6 +378,9 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_DS_AREA: msr_info->data = pmu->ds_area; return 0; + case MSR_PEBS_DATA_CFG: + msr_info->data = pmu->pebs_data_cfg; + return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -446,6 +454,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; pmu->ds_area = data; return 0; + case MSR_PEBS_DATA_CFG: + if (pmu->pebs_data_cfg == data) + return 0; + if (!(data & pmu->pebs_data_cfg_mask)) { + pmu->pebs_data_cfg = data; + return 0; + } + break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -515,6 +531,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->raw_event_mask = X86_RAW_EVENT_MASK; pmu->fixed_ctr_ctrl_mask = ~0ull; pmu->pebs_enable_mask = ~0ull; + pmu->pebs_data_cfg_mask = ~0ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry || !vcpu->kvm->arch.enable_pmu) @@ -595,6 +612,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->fixed_ctr_ctrl_mask &= ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4)); } + pmu->pebs_data_cfg_mask = ~0xff00000full; } else { pmu->pebs_enable_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ead86072612d..2d9456b4874b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1448,7 +1448,7 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, - MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, -- cgit v1.2.3 From d10551738f6adcc3e6040fc846b171e72e94f0e9 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:41 +0800 Subject: KVM: x86: Set PEBS_UNAVAIL in IA32_MISC_ENABLE when PEBS is enabled Bit 12 represents "Processor Event Based Sampling Unavailable (RO)": 1 = PEBS is not supported; 0 = PEBS is supported. A write to this PEBS_UNAVAIL bit will bring a #GP(0) when guest PEBS is enabled. Some PEBS drivers in the guest may care about this bit. 
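To illustrate why guests care, a PEBS driver typically probes this bit before touching any PEBS machinery; a minimal sketch of such a probe (the constants are spelled out here rather than taken from kernel headers):

#define MSR_IA32_MISC_ENABLE	0x000001a0
#define PEBS_UNAVAIL		(1ULL << 12)	/* RO: 1 = PEBS not supported */

static bool guest_pebs_available(void)
{
	u64 misc;

	rdmsrl(MSR_IA32_MISC_ENABLE, misc);
	return !(misc & PEBS_UNAVAIL);
}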
Signed-off-by: Like Xu Message-Id: <20220411101946.20262-13-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 2 ++ arch/x86/kvm/x86.c | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 69eb5372c922..02cad8e08ed0 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -605,6 +605,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_FORMAT) { + vcpu->arch.ia32_misc_enable_msr &= ~MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL; if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_BASELINE) { pmu->pebs_enable_mask = ~pmu->global_ctrl; pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; @@ -618,6 +619,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) ~((1ull << pmu->nr_arch_gp_counters) - 1); } } else { + vcpu->arch.ia32_misc_enable_msr |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL; vcpu->arch.perf_capabilities &= ~PERF_CAP_PEBS_MASK; } } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2d9456b4874b..94b92381cc8b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3561,7 +3561,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_MISC_ENABLE: { u64 old_val = vcpu->arch.ia32_misc_enable_msr; - u64 pmu_mask = MSR_IA32_MISC_ENABLE_EMON; + u64 pmu_mask = MSR_IA32_MISC_ENABLE_EMON | + MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL; + + /* RO bits */ + if (!msr_info->host_initiated && + ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL)) + return 1; /* * For a dummy user space, the order of setting vPMU capabilities and -- cgit v1.2.3 From 63f21f326fc9e068d04c2c1d0a722e8db65588ba Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:42 +0800 Subject: KVM: x86/pmu: Move pmc_speculative_in_use() to arch/x86/kvm/pmu.h It allows this inline function to be reused by more callers in more files, such as pmu_intel.c. Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-14-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 11 ----------- arch/x86/kvm/pmu.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 162dfe23c071..6e6898ef0011 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -503,17 +503,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) kvm_pmu_refresh(vcpu); } -static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - if (pmc_is_fixed(pmc)) - return fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; - - return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; -} - /* Release perf_events for vPMCs that have been unused for a full time slice. 
*/ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 5966ce18a82b..d01c883ca95c 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -152,6 +152,17 @@ static inline void pmc_update_sample_period(struct kvm_pmc *pmc) get_sample_period(pmc, pmc->counter)); } +static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (pmc_is_fixed(pmc)) + return fixed_ctrl_field(pmu->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; + + return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; +} + void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); -- cgit v1.2.3 From 854250329c02c0616a42532d65e81365272326d1 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:43 +0800 Subject: KVM: x86/pmu: Disable guest PEBS temporarily in two rare situations The guest PEBS will be disabled when some users try to perf KVM and its user-space through the same PEBS facility OR when the host perf doesn't schedule the guest PEBS counter in a one-to-one mapping manner (neither of these are typical scenarios). The PEBS records in the guest DS buffer are still accurate and the above two restrictions will be checked before each vm-entry only if guest PEBS is deemed to be enabled. Suggested-by: Wei Wang Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-15-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/events/intel/core.c | 11 +++++++++-- arch/x86/include/asm/kvm_host.h | 9 +++++++++ arch/x86/kvm/vmx/pmu_intel.c | 20 ++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 4 ++++ arch/x86/kvm/vmx/vmx.h | 1 + 5 files changed, 43 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 8f6189f2fbf3..39832a5e7d75 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4048,8 +4048,15 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask, }; - /* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */ - arr[0].guest |= arr[*nr].guest; + if (arr[pebs_enable].host) { + /* Disable guest PEBS if host PEBS is enabled. */ + arr[pebs_enable].guest = 0; + } else { + /* Disable guest PEBS for cross-mapped PEBS counters. */ + arr[pebs_enable].guest &= ~kvm_pmu->host_cross_mapped_mask; + /* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */ + arr[global_ctrl].guest |= arr[pebs_enable].guest; + } return arr; } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d99c130d0a13..032278f0ee6d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -527,6 +527,15 @@ struct kvm_pmu { u64 pebs_data_cfg; u64 pebs_data_cfg_mask; + /* + * If a guest counter is cross-mapped to host counter with different + * index, its PEBS capability will be temporarily disabled. + * + * The user should make sure that this mask is updated + * after disabling interrupts and before perf_guest_get_msrs(); + */ + u64 host_cross_mapped_mask; + /* * The gate to release perf_events not marked in * pmc_in_use only once in a vcpu time slice. 
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 02cad8e08ed0..cc3d2a768320 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -786,6 +786,26 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) intel_pmu_release_guest_lbr_event(vcpu); } +void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) +{ + struct kvm_pmc *pmc = NULL; + int bit; + + for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl, + X86_PMC_IDX_MAX) { + pmc = intel_pmc_idx_to_pmc(pmu, bit); + + if (!pmc || !pmc_speculative_in_use(pmc) || + !intel_pmc_is_enabled(pmc)) + continue; + + if (pmc->perf_event && pmc->idx != pmc->perf_event->hw.idx) { + pmu->host_cross_mapped_mask |= + BIT_ULL(pmc->perf_event->hw.idx); + } + } +} + struct kvm_pmu_ops intel_pmu_ops __initdata = { .pmc_perf_hw_id = intel_pmc_perf_hw_id, .pmc_is_enabled = intel_pmc_is_enabled, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 070b02162db6..d5ec0635ccd4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6796,6 +6796,10 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) struct perf_guest_switch_msr *msrs; struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); + pmu->host_cross_mapped_mask = 0; + if (pmu->pebs_enable & pmu->global_ctrl) + intel_pmu_cross_mapped_check(pmu); + /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); if (!msrs) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index d7baedda79e5..2d6d7870a974 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -94,6 +94,7 @@ union vmx_exit_reason { #define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc) #define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records) +void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu); bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 968635abd5f5986f3cb6f15602d365cf1b551c5d Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:44 +0800 Subject: KVM: x86/pmu: Add kvm_pmu_cap to optimize perf_get_x86_pmu_capability The information obtained from the interface perf_get_x86_pmu_capability() doesn't change, so an exported "struct x86_pmu_capability" is introduced for all guests in the KVM, and it's initialized before hardware_setup(). 
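The effect is that perf_get_x86_pmu_capability() runs once at hardware setup and every later consumer just reads the cached copy; for example (an illustrative helper, not part of the patch):

/* Sketch: read the cached capability instead of re-querying perf. */
static inline int kvm_max_gp_counters(void)
{
	return min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp);
}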
Signed-off-by: Like Xu Message-Id: <20220411101946.20262-16-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 27 ++++++++------------------- arch/x86/kvm/pmu.c | 3 +++ arch/x86/kvm/pmu.h | 19 +++++++++++++++++++ arch/x86/kvm/vmx/pmu_intel.c | 17 ++++++++--------- arch/x86/kvm/x86.c | 9 ++++----- 5 files changed, 42 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index de6d44e07e34..211f4566641e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -868,7 +868,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) case 9: break; case 0xa: { /* Architectural Performance Monitoring */ - struct x86_pmu_capability cap; union cpuid10_eax eax; union cpuid10_edx edx; @@ -877,30 +876,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; } - perf_get_x86_pmu_capability(&cap); + eax.split.version_id = kvm_pmu_cap.version; + eax.split.num_counters = kvm_pmu_cap.num_counters_gp; + eax.split.bit_width = kvm_pmu_cap.bit_width_gp; + eax.split.mask_length = kvm_pmu_cap.events_mask_len; + edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed; + edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed; - /* - * The guest architecture pmu is only supported if the architecture - * pmu exists on the host and the module parameters allow it. - */ - if (!cap.version || !enable_pmu) - memset(&cap, 0, sizeof(cap)); - - eax.split.version_id = min(cap.version, 2); - eax.split.num_counters = cap.num_counters_gp; - eax.split.bit_width = cap.bit_width_gp; - eax.split.mask_length = cap.events_mask_len; - - edx.split.num_counters_fixed = - min(cap.num_counters_fixed, KVM_PMC_MAX_FIXED); - edx.split.bit_width_fixed = cap.bit_width_fixed; - if (cap.version) + if (kvm_pmu_cap.version) edx.split.anythread_deprecated = 1; edx.split.reserved1 = 0; edx.split.reserved2 = 0; entry->eax = eax.full; - entry->ebx = cap.events_mask; + entry->ebx = kvm_pmu_cap.events_mask; entry->ecx = 0; entry->edx = edx.full; break; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 6e6898ef0011..a8f0618a5ebe 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -24,6 +24,9 @@ /* This is enough to filter the vast majority of currently defined events. */ #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300 +struct x86_pmu_capability __read_mostly kvm_pmu_cap; +EXPORT_SYMBOL_GPL(kvm_pmu_cap); + /* NOTE: * - Each perf counter is defined as "struct kvm_pmc"; * - There are two types of perf counters: general purpose (gp) and fixed. diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index d01c883ca95c..9e25cdd2bd1a 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -163,6 +163,24 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; } +extern struct x86_pmu_capability kvm_pmu_cap; + +static inline void kvm_init_pmu_capability(void) +{ + perf_get_x86_pmu_capability(&kvm_pmu_cap); + + /* + * Only support guest architectural pmu on + * a host with architectural pmu. 
+ */ + if (!kvm_pmu_cap.version) + memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); + + kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, + KVM_PMC_MAX_FIXED); +} + void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); @@ -181,6 +199,7 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id); +void kvm_init_pmu_capability(void); bool is_vmware_backdoor_pmc(u32 pmc_idx); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index cc3d2a768320..83d1081cbd3c 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -515,8 +515,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); - - struct x86_pmu_capability x86_pmu; struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; @@ -544,13 +542,14 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) return; vcpu->arch.ia32_misc_enable_msr |= MSR_IA32_MISC_ENABLE_EMON; - perf_get_x86_pmu_capability(&x86_pmu); pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, - x86_pmu.num_counters_gp); - eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp); + kvm_pmu_cap.num_counters_gp); + eax.split.bit_width = min_t(int, eax.split.bit_width, + kvm_pmu_cap.bit_width_gp); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; - eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len); + eax.split.mask_length = min_t(int, eax.split.mask_length, + kvm_pmu_cap.events_mask_len); pmu->available_event_types = ~entry->ebx & ((1ull << eax.split.mask_length) - 1); @@ -560,9 +559,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = min3(ARRAY_SIZE(fixed_pmc_events), (size_t) edx.split.num_counters_fixed, - (size_t) x86_pmu.num_counters_fixed); - edx.split.bit_width_fixed = min_t(int, - edx.split.bit_width_fixed, x86_pmu.bit_width_fixed); + (size_t)kvm_pmu_cap.num_counters_fixed); + edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, + kvm_pmu_cap.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; setup_fixed_pmc_eventsel(pmu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 94b92381cc8b..32218fac4504 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6667,15 +6667,12 @@ out: static void kvm_init_msr_list(void) { - struct x86_pmu_capability x86_pmu; u32 dummy[2]; unsigned i; BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, "Please update the fixed PMCs in msrs_to_saved_all[]"); - perf_get_x86_pmu_capability(&x86_pmu); - num_msrs_to_save = 0; num_emulated_msrs = 0; num_msr_based_features = 0; @@ -6727,12 +6724,12 @@ static void kvm_init_msr_list(void) break; case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; case MSR_ARCH_PERFMON_EVENTSEL0 ... 
MSR_ARCH_PERFMON_EVENTSEL0 + 17: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; case MSR_IA32_XFD: @@ -11721,6 +11718,8 @@ int kvm_arch_hardware_setup(void *opaque) if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); + kvm_init_pmu_capability(); + r = ops->hardware_setup(); if (r != 0) return r; -- cgit v1.2.3 From 59cc99f6e971bb24b40e27f695daab98e2eff4b8 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:45 +0800 Subject: KVM: x86/cpuid: Refactor host/guest CPU model consistency check For the same purpose, the legacy intel_pmu_lbr_is_compatible() can be renamed for reuse by more callers, and the comment about the LBR use case can be deleted along the way. Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-17-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 5 +++++ arch/x86/kvm/vmx/pmu_intel.c | 12 +----------- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/vmx.h | 1 - 4 files changed, 7 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 8a770b481d9d..ac72aabba981 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -145,6 +145,11 @@ static inline int guest_cpuid_model(struct kvm_vcpu *vcpu) return x86_model(best->eax); } +static inline bool cpuid_model_is_consistent(struct kvm_vcpu *vcpu) +{ + return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); +} + static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 83d1081cbd3c..ca219a54a53e 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -167,16 +167,6 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); } -bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu) -{ - /* - * As a first step, a guest could only enable LBR feature if its - * cpu model is the same as the host because the LBR registers - * would be pass-through to the guest and they're model specific. 
- */ - return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); -} - bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) { struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu); @@ -595,7 +585,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) nested_vmx_pmu_refresh(vcpu, intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)); - if (intel_pmu_lbr_is_compatible(vcpu)) + if (cpuid_model_is_consistent(vcpu)) x86_perf_get_lbr(&lbr_desc->records); else lbr_desc->records.nr = 0; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d5ec0635ccd4..403a8834cc79 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2242,7 +2242,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((data & PMU_CAP_LBR_FMT) != (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)) return 1; - if (!intel_pmu_lbr_is_compatible(vcpu)) + if (!cpuid_model_is_consistent(vcpu)) return 1; } ret = kvm_set_msr_common(vcpu, msr_info); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 2d6d7870a974..71bcb486e73f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -95,7 +95,6 @@ union vmx_exit_reason { #define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records) void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); -bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu); bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From cf8e55fe50df0c0292b389a165daa81193cd39d1 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:46 +0800 Subject: KVM: x86/pmu: Expose CPUIDs feature bits PDCM, DS, DTES64 The CPUID features PDCM, DS and DTES64 are required for PEBS feature. KVM would expose CPUID feature PDCM, DS and DTES64 to guest when PEBS is supported in the KVM on the Ice Lake server platforms. Originally-by: Andi Kleen Co-developed-by: Kan Liang Signed-off-by: Kan Liang Co-developed-by: Luwei Kang Signed-off-by: Luwei Kang Signed-off-by: Like Xu Message-Id: <20220411101946.20262-18-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/capabilities.h | 28 +++++++++++++++++----------- arch/x86/kvm/vmx/vmx.c | 15 +++++++++++++++ 2 files changed, 32 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 5f656c9e33be..f14c4bef97e0 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -6,6 +6,7 @@ #include "lapic.h" #include "x86.h" +#include "pmu.h" extern bool __read_mostly enable_vpid; extern bool __read_mostly flexpriority_enabled; @@ -398,23 +399,28 @@ static inline bool vmx_pt_mode_is_host_guest(void) return pt_mode == PT_MODE_HOST_GUEST; } -static inline u64 vmx_get_perf_capabilities(void) +static inline bool vmx_pebs_supported(void) { - u64 perf_cap = 0; + return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept; +} - if (!enable_pmu) - return perf_cap; +static inline u64 vmx_get_perf_capabilities(void) +{ + u64 perf_cap = PMU_CAP_FW_WRITES; + u64 host_perf_cap = 0; if (boot_cpu_has(X86_FEATURE_PDCM)) - rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); + rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); - perf_cap &= PMU_CAP_LBR_FMT; + perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; - /* - * Since counters are virtualized, KVM would support full - * width counting unconditionally, even if the host lacks it. 
- */ - return PMU_CAP_FW_WRITES | perf_cap; + if (vmx_pebs_supported()) { + perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; + if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) + perf_cap &= ~PERF_CAP_PEBS_BASELINE; + } + + return perf_cap; } static inline u64 vmx_supported_debugctl(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 403a8834cc79..55a8578255cb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2245,6 +2245,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!cpuid_model_is_consistent(vcpu)) return 1; } + if (data & PERF_CAP_PEBS_FORMAT) { + if ((data & PERF_CAP_PEBS_MASK) != + (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64)) + return 1; + if (!cpuid_model_is_consistent(vcpu)) + return 1; + } ret = kvm_set_msr_common(vcpu, msr_info); break; @@ -7517,6 +7528,10 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_INVPCID); if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + if (vmx_pebs_supported()) { + kvm_cpu_cap_check_and_set(X86_FEATURE_DS); + kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); + } if (!enable_sgx) { kvm_cpu_cap_clear(X86_FEATURE_SGX); -- cgit v1.2.3 From 43d62d108af87e683ebe41ffd76ac60594544de3 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Thu, 19 May 2022 01:01:16 +0800 Subject: KVM: x86/pmu: Move the vmx_icl_pebs_cpu[] definition out of the header file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defining a static const array in a header file would introduce redundant definitions to the point of confusing semantics, and such a use case would only bring complaints from the compiler: arch/x86/kvm/pmu.h:20:32: warning: ‘vmx_icl_pebs_cpu’ defined but not used [-Wunused-const-variable=] 20 | static const struct x86_cpu_id vmx_icl_pebs_cpu[] = { | ^~~~~~~~~~~~~~~~ Fixes: a095df2c5f48 ("KVM: x86/pmu: Adjust precise_ip to emulate Ice Lake guest PDIR counter") Signed-off-by: Like Xu Message-Id: <20220518170118.66263-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 7 +++++++ arch/x86/kvm/pmu.h | 8 -------- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index a8f0618a5ebe..1ed63ea19c00 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -16,6 +16,7 @@ #include <linux/bsearch.h> #include <linux/sort.h> #include <asm/perf_event.h> +#include <asm/cpu_device_id.h> #include "x86.h" #include "cpuid.h" #include "lapic.h" @@ -27,6 +28,12 @@ struct x86_pmu_capability __read_mostly kvm_pmu_cap; EXPORT_SYMBOL_GPL(kvm_pmu_cap); +static const struct x86_cpu_id vmx_icl_pebs_cpu[] = { + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL), + {} +}; + /* NOTE: * - Each perf counter is defined as "struct kvm_pmc"; * - There are two types of perf counters: general purpose (gp) and fixed. 
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 9e25cdd2bd1a..796ff839493b 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -4,8 +4,6 @@ #include <linux/nospec.h> -#include <asm/cpu_device_id.h> - #define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu) #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) @@ -17,12 +15,6 @@ #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 #define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002 -static const struct x86_cpu_id vmx_icl_pebs_cpu[] = { - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL), - {} -}; - struct kvm_event_hw_type_mapping { u8 eventsel; u8 unit_mask; -- cgit v1.2.3 From ec4036edf924f741bc717d9afa25053cf63fa218 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 20 May 2022 08:00:05 -0400 Subject: KVM: x86/pmu: remove useless prototype Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 796ff839493b..f7bd4de14c92 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -191,7 +191,6 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id); -void kvm_init_pmu_capability(void); bool is_vmware_backdoor_pmc(u32 pmc_idx); -- cgit v1.2.3 From c49467a45fe013ad7a892bf1479b1438315058f3 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 10 May 2022 12:44:07 +0800 Subject: KVM: x86/pmu: Don't overwrite the pmu->global_ctrl when refreshing Assigning a value to pmu->global_ctrl just to set the value of pmu->global_ctrl_mask is more readable but does not conform to the specification. The value is reset to zero on Power up and Reset but stays unchanged on INIT, like most other MSRs. 
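For concreteness, with 8 GP counters and 3 fixed counters the mask computation works out as follows (a worked example with assumed counter counts, not code from the patch):

/* GP enable bits occupy [7:0], fixed enable bits start at bit 32. */
u64 gp_bits      = (1ull << 8) - 1;		/* 0x00000000ff */
u64 fixed_bits   = ((1ull << 3) - 1) << 32;	/* 0x0700000000 */
u64 counter_mask = ~(gp_bits | fixed_bits);	/* reserved GLOBAL_CTRL bits */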
Signed-off-by: Like Xu Message-Id: <20220510044407.26445-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index ca219a54a53e..1c6f2ca2beac 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -508,6 +508,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; + u64 counter_mask; int i; pmu->nr_arch_gp_counters = 0; @@ -559,9 +560,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) for (i = 0; i < pmu->nr_arch_fixed_counters; i++) pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); - pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); - pmu->global_ctrl_mask = ~pmu->global_ctrl; + counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED)); + pmu->global_ctrl_mask = counter_mask; pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); @@ -596,7 +597,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_FORMAT) { vcpu->arch.ia32_misc_enable_msr &= ~MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL; if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_BASELINE) { - pmu->pebs_enable_mask = ~pmu->global_ctrl; + pmu->pebs_enable_mask = counter_mask; pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { pmu->fixed_ctr_ctrl_mask &= -- cgit v1.2.3 From 98defd2e17803263f49548fea930cfc974d505aa Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 9 May 2022 18:22:02 +0800 Subject: KVM: x86/pmu: Ignore pmu->global_ctrl check if vPMU doesn't support global_ctrl MSR_CORE_PERF_GLOBAL_CTRL is introduced as part of Architecture PMU V2, as indicated by Intel SDM 19.2.2 and the intel_is_valid_msr() function. So in the absence of global_ctrl support, all PMCs are enabled as AMD does. Signed-off-by: Like Xu Message-Id: <20220509102204.62389-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 1c6f2ca2beac..2fc90080dcce 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -98,6 +98,9 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); + if (pmu->version < 2) + return true; + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); } -- cgit v1.2.3 From bfb088d9fb5abdd3fbf00bae9abdfee8b92265aa Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 1 Jun 2022 03:21:19 -0400 Subject: KVM: vmx, pmu: accept 0 for host-initiated write to MSR_IA32_DS_AREA Whenever an MSR is part of KVM_GET_MSR_INDEX_LIST, as is the case for MSR_IA32_DS_AREA, it has to be always settable with KVM_SET_MSR. Accept a zero value for these MSRs to obey the contract. 
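Seen from userspace, the contract is that a blind save/restore cycle must not fail even if the guest never enabled the feature; a rough sketch (error handling elided; vcpu_fd is assumed to be an open vCPU file descriptor):

#include <linux/kvm.h>
#include <sys/ioctl.h>

struct {
	struct kvm_msrs		hdr;
	struct kvm_msr_entry	entry;
} msrs = {
	.hdr.nmsrs   = 1,
	.entry.index = 0x600,	/* MSR_IA32_DS_AREA */
	.entry.data  = 0,	/* zero must be accepted */
};

int ret = ioctl(vcpu_fd, KVM_SET_MSRS, &msrs);	/* returns 1 on success */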
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 2fc90080dcce..5bc7cfc753fc 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -443,6 +443,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_IA32_DS_AREA: + if (msr_info->host_initiated && data && !guest_cpuid_has(vcpu, X86_FEATURE_DS)) + return 1; if (is_noncanonical_address(data, vcpu)) return 1; pmu->ds_area = data; -- cgit v1.2.3 From d1c88a4020567ba4da52f778bcd9619d87e4ea75 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 25 May 2022 04:39:22 -0400 Subject: KVM: x86: always allow host-initiated writes to PMU MSRs Whenever an MSR is part of KVM_GET_MSR_INDEX_LIST, it has to be always retrievable and settable with KVM_GET_MSR and KVM_SET_MSR. Accept the PMU MSRs unconditionally in intel_is_valid_msr, if the access was host-initiated. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 4 ++-- arch/x86/kvm/pmu.h | 4 ++-- arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 27 +++++++++++++++++---------- arch/x86/kvm/x86.c | 10 +++++----- 5 files changed, 27 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 1ed63ea19c00..d2320d1aeb9f 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -458,10 +458,10 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) } } -bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated) { return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) || - static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr); + static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr, host_initiated); } static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index f7bd4de14c92..1398297ae6dc 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -29,7 +29,7 @@ struct kvm_pmu_ops { unsigned int idx, u64 *mask); struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr); bool (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); - bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); + bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void (*refresh)(struct kvm_vcpu *vcpu); @@ -181,7 +181,7 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); -bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); +bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void kvm_pmu_refresh(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 136039fc6d01..0e5784371ac0 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -229,7 +229,7 @@ static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return &counters[idx]; } -static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +static bool amd_is_valid_msr(struct 
kvm_vcpu *vcpu, u32 msr, bool host_initiated) { /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */ return false; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5bc7cfc753fc..8eca1321af7e 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -195,38 +195,45 @@ static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) return ret; } -static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); u64 perf_capabilities = vcpu->arch.perf_capabilities; - int ret; switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: case MSR_CORE_PERF_GLOBAL_STATUS: case MSR_CORE_PERF_GLOBAL_CTRL: case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - ret = pmu->version > 1; + if (host_initiated) + return true; + return pmu->version > 1; break; case MSR_IA32_PEBS_ENABLE: - ret = perf_capabilities & PERF_CAP_PEBS_FORMAT; + if (host_initiated) + return true; + return perf_capabilities & PERF_CAP_PEBS_FORMAT; break; case MSR_IA32_DS_AREA: - ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); + if (host_initiated) + return true; + return guest_cpuid_has(vcpu, X86_FEATURE_DS); break; case MSR_PEBS_DATA_CFG: - ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) && + if (host_initiated) + return true; + return (perf_capabilities & PERF_CAP_PEBS_BASELINE) && ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3); break; default: - ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || + if (host_initiated) + return true; + return get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr) || intel_pmu_is_valid_lbr_msr(vcpu, msr); break; } - - return ret; } static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) @@ -589,7 +596,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); nested_vmx_pmu_refresh(vcpu, - intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)); + intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, false)); if (cpuid_model_is_consistent(vcpu)) x86_perf_get_lbr(&lbr_desc->records); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 32218fac4504..c8dfdef9e52f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3719,7 +3719,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: - if (kvm_pmu_is_valid_msr(vcpu, msr)) + if (kvm_pmu_is_valid_msr(vcpu, msr, msr_info->host_initiated)) return kvm_pmu_set_msr(vcpu, msr_info); if (pr || data != 0) @@ -3802,7 +3802,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; #endif default: - if (kvm_pmu_is_valid_msr(vcpu, msr)) + if (kvm_pmu_is_valid_msr(vcpu, msr, msr_info->host_initiated)) return kvm_pmu_set_msr(vcpu, msr_info); return KVM_MSR_RET_INVALID; } @@ -3882,7 +3882,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0; break; case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) return kvm_pmu_get_msr(vcpu, msr_info); if (!msr_info->host_initiated) return 1; @@ -3892,7 +3892,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_PERFCTR0 ... 
MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) return kvm_pmu_get_msr(vcpu, msr_info); msr_info->data = 0; break; @@ -4138,7 +4138,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; #endif default: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) return kvm_pmu_get_msr(vcpu, msr_info); return KVM_MSR_RET_INVALID; } -- cgit v1.2.3 From a33095f4937b362306f8636742450cff1c4630af Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 18 May 2022 21:25:02 +0800 Subject: KVM: x86/pmu: Update comments for AMD gp counters The obsolete comment could more accurately state that AMD platforms have two base MSR addresses and two different maximum numbers for gp counters, depending on the X86_FEATURE_PERFCTR_CORE feature. Signed-off-by: Like Xu Message-Id: <20220518132512.37864-2-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index d2320d1aeb9f..72512a33a04e 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -44,7 +44,9 @@ static const struct x86_cpu_id vmx_icl_pebs_cpu[] = { * However AMD doesn't support fixed-counters; * - There are three types of index to access perf counters (PMC): * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD - * has MSR_K7_PERFCTRn. + * has MSR_K7_PERFCTRn and, for families 15H and later, + * MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are + * aliased to MSR_K7_PERFCTRn. * 2. MSR Index (named idx): This normally is used by RDPMC instruction. * For instance AMD RDPMC instruction uses 0000_0003h in ECX to access * C001_0007h (MSR_K7_PERCTR3). Intel has a similar mechanism, except @@ -56,7 +58,8 @@ static const struct x86_cpu_id vmx_icl_pebs_cpu[] = { * between pmc and perf counters is as the following: * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed - * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters + * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H + * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters */ static struct kvm_pmu_ops kvm_pmu_ops __read_mostly; -- cgit v1.2.3 From 89cb454ea984d0411523dc10e70e9bf0aca1b527 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 18 May 2022 21:25:03 +0800 Subject: KVM: x86/pmu: Extract check_pmu_event_filter() handling both GP and fixed counters Checking the kvm->arch.pmu_event_filter policy in both gp and fixed code paths was somewhat redundant, so common parts can be extracted, which reduces code footprint and improves readability. 
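For context, the filter consulted here is installed from userspace via KVM_SET_PMU_EVENT_FILTER; a minimal sketch of an allow-list holding one raw event (the event select value is only an example, and vm_fd is assumed to be an open VM file descriptor):

#include <linux/kvm.h>
#include <sys/ioctl.h>

struct {
	struct kvm_pmu_event_filter	filter;
	__u64				events[1];
} f = {
	.filter.action  = KVM_PMU_EVENT_ALLOW,
	.filter.nevents = 1,
	.events         = { 0x3c },	/* e.g. unhalted core cycles */
};

int ret = ioctl(vm_fd, KVM_SET_PMU_EVENT_FILTER, &f);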
Signed-off-by: Like Xu Reviewed-by: Wanpeng Li Message-Id: <20220518132512.37864-3-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 63 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 72512a33a04e..ee6b2895faed 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -250,14 +250,44 @@ static int cmp_u64(const void *pa, const void *pb) return (a > b) - (a < b); } +static bool check_pmu_event_filter(struct kvm_pmc *pmc) +{ + struct kvm_pmu_event_filter *filter; + struct kvm *kvm = pmc->vcpu->kvm; + bool allow_event = true; + __u64 key; + int idx; + + filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); + if (!filter) + goto out; + + if (pmc_is_gp(pmc)) { + key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB; + if (bsearch(&key, filter->events, filter->nevents, + sizeof(__u64), cmp_u64)) + allow_event = filter->action == KVM_PMU_EVENT_ALLOW; + else + allow_event = filter->action == KVM_PMU_EVENT_DENY; + } else { + idx = pmc->idx - INTEL_PMC_IDX_FIXED; + if (filter->action == KVM_PMU_EVENT_DENY && + test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) + allow_event = false; + if (filter->action == KVM_PMU_EVENT_ALLOW && + !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) + allow_event = false; + } + +out: + return allow_event; +} + void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) { u64 config; u32 type = PERF_TYPE_RAW; - struct kvm *kvm = pmc->vcpu->kvm; - struct kvm_pmu_event_filter *filter; - struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu); - bool allow_event = true; + struct kvm_pmu *pmu = pmc_to_pmu(pmc); if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) printk_once("kvm pmu: pin control bit is ignored\n"); @@ -269,17 +299,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) return; - filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); - if (filter) { - __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB; - - if (bsearch(&key, filter->events, filter->nevents, - sizeof(__u64), cmp_u64)) - allow_event = filter->action == KVM_PMU_EVENT_ALLOW; - else - allow_event = filter->action == KVM_PMU_EVENT_DENY; - } - if (!allow_event) + if (!check_pmu_event_filter(pmc)) return; if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | @@ -312,23 +332,14 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) { unsigned en_field = ctrl & 0x3; bool pmi = ctrl & 0x8; - struct kvm_pmu_event_filter *filter; - struct kvm *kvm = pmc->vcpu->kvm; pmc_pause_counter(pmc); if (!en_field || !pmc_is_enabled(pmc)) return; - filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); - if (filter) { - if (filter->action == KVM_PMU_EVENT_DENY && - test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - return; - if (filter->action == KVM_PMU_EVENT_ALLOW && - !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - return; - } + if (!check_pmu_event_filter(pmc)) + return; if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc)) return; -- cgit v1.2.3 From a40239b4cf33b2de872b04759c3e5ab87cc72a7f Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 18 May 2022 21:25:05 +0800 Subject: KVM: x86/pmu: Pass only "struct kvm_pmc *pmc" to reprogram_counter() Passing the reference "struct kvm_pmc *pmc" when creating pmc->perf_event is sufficient. 
This change helps to simplify the calling convention by replacing reprogram_{gp, fixed}_counter() with reprogram_counter() seamlessly. No functional change intended. Signed-off-by: Like Xu Message-Id: <20220518132512.37864-5-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 17 +++++------------ arch/x86/kvm/pmu.h | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 32 ++++++++++++++++++-------------- 3 files changed, 24 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index ee6b2895faed..817352d83ff4 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -355,18 +355,13 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) } EXPORT_SYMBOL_GPL(reprogram_fixed_counter); -void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) +void reprogram_counter(struct kvm_pmc *pmc) { - struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, pmc_idx); - - if (!pmc) - return; - if (pmc_is_gp(pmc)) reprogram_gp_counter(pmc, pmc->eventsel); else { - int idx = pmc_idx - INTEL_PMC_IDX_FIXED; - u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx); + int idx = pmc->idx - INTEL_PMC_IDX_FIXED; + u8 ctrl = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, idx); reprogram_fixed_counter(pmc, ctrl, idx); } @@ -385,8 +380,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) clear_bit(bit, pmu->reprogram_pmi); continue; } - - reprogram_counter(pmu, bit); + reprogram_counter(pmc); } /* @@ -559,13 +553,12 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { - struct kvm_pmu *pmu = pmc_to_pmu(pmc); u64 prev_count; prev_count = pmc->counter; pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); - reprogram_counter(pmu, pmc->idx); + reprogram_counter(pmc); if (pmc->counter < prev_count) __kvm_perf_overflow(pmc, false); } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 1398297ae6dc..1dd6be22cdb2 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -175,7 +175,7 @@ static inline void kvm_init_pmu_capability(void) void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); -void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); +void reprogram_counter(struct kvm_pmc *pmc); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 8eca1321af7e..719ae6c62a5a 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -56,16 +56,32 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) pmu->fixed_ctr_ctrl = data; } +static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) +{ + if (pmc_idx < INTEL_PMC_IDX_FIXED) { + return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, + MSR_P6_EVNTSEL0); + } else { + u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; + + return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); + } +} + /* function is called when global control register has been updated. 
*/ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) { int bit; u64 diff = pmu->global_ctrl ^ data; + struct kvm_pmc *pmc; pmu->global_ctrl = data; - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - reprogram_counter(pmu, bit); + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) { + pmc = intel_pmc_idx_to_pmc(pmu, bit); + if (pmc) + reprogram_counter(pmc); + } } static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) @@ -104,18 +120,6 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); } -static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) -{ - if (pmc_idx < INTEL_PMC_IDX_FIXED) - return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, - MSR_P6_EVNTSEL0); - else { - u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; - - return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); - } -} - static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); -- cgit v1.2.3 From fb121aaf19cd5047a01599debbb85a2c15275727 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 18 May 2022 21:25:06 +0800 Subject: KVM: x86/pmu: Drop "u64 eventsel" for reprogram_gp_counter() Because inside reprogram_gp_counter() it is bound to assign the requested eventsel to pmc->eventsel, this assignment step can be moved forward, thus simplifying the passing of parameters to "struct kvm_pmc *pmc" only. No functional change intended. Signed-off-by: Like Xu Message-Id: <20220518132512.37864-6-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 7 +++---- arch/x86/kvm/pmu.h | 2 +- arch/x86/kvm/svm/pmu.c | 6 ++++-- arch/x86/kvm/vmx/pmu_intel.c | 3 ++- 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 817352d83ff4..b001471fbf82 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -283,17 +283,16 @@ out: return allow_event; } -void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) +void reprogram_gp_counter(struct kvm_pmc *pmc) { u64 config; u32 type = PERF_TYPE_RAW; struct kvm_pmu *pmu = pmc_to_pmu(pmc); + u64 eventsel = pmc->eventsel; if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) printk_once("kvm pmu: pin control bit is ignored\n"); - pmc->eventsel = eventsel; - pmc_pause_counter(pmc); if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) @@ -358,7 +357,7 @@ void reprogram_counter(struct kvm_pmc *pmc) { if (pmc_is_gp(pmc)) - reprogram_gp_counter(pmc, pmc->eventsel); + reprogram_gp_counter(pmc); else { int idx = pmc->idx - INTEL_PMC_IDX_FIXED; u8 ctrl = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, idx); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 1dd6be22cdb2..b9a76dd98242 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -173,7 +173,7 @@ static inline void kvm_init_pmu_capability(void) KVM_PMC_MAX_FIXED); } -void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); +void reprogram_gp_counter(struct kvm_pmc *pmc); void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); void reprogram_counter(struct kvm_pmc *pmc); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 0e5784371ac0..a1fbb72d6fbb 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -286,8 +286,10 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); if
(pmc) { data &= ~pmu->reserved_bits; - if (data != pmc->eventsel) - reprogram_gp_counter(pmc, data); + if (data != pmc->eventsel) { + pmc->eventsel = data; + reprogram_gp_counter(pmc); + } return 0; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 719ae6c62a5a..61e14a5a247d 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -492,7 +492,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED)) reserved_bits ^= HSW_IN_TX_CHECKPOINTED; if (!(data & reserved_bits)) { - reprogram_gp_counter(pmc, data); + pmc->eventsel = data; + reprogram_gp_counter(pmc); return 0; } } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) -- cgit v1.2.3 From 76d287b2342e1906e399fd19d6500013aa074a50 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 18 May 2022 21:25:07 +0800 Subject: KVM: x86/pmu: Drop "u8 ctrl, int idx" for reprogram_fixed_counter() Since reprogram_fixed_counter() is bound to assign the requested fixed_ctr_ctrl to pmu->fixed_ctr_ctrl, this assignment step can be moved forward (the stale value for diff is saved extra early), thus simplifying the passing of parameters. No functional change intended. Signed-off-by: Like Xu Message-Id: <20220518132512.37864-7-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 13 ++++++------- arch/x86/kvm/pmu.h | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 14 +++++++------- 3 files changed, 14 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b001471fbf82..4c354298e516 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -327,8 +327,11 @@ void reprogram_gp_counter(struct kvm_pmc *pmc) } EXPORT_SYMBOL_GPL(reprogram_gp_counter); -void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) +void reprogram_fixed_counter(struct kvm_pmc *pmc) { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + int idx = pmc->idx - INTEL_PMC_IDX_FIXED; + u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx); unsigned en_field = ctrl & 0x3; bool pmi = ctrl & 0x8; @@ -358,12 +361,8 @@ void reprogram_counter(struct kvm_pmc *pmc) { if (pmc_is_gp(pmc)) reprogram_gp_counter(pmc); - else { - int idx = pmc->idx - INTEL_PMC_IDX_FIXED; - u8 ctrl = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, idx); - - reprogram_fixed_counter(pmc, ctrl, idx); - } + else + reprogram_fixed_counter(pmc); } EXPORT_SYMBOL_GPL(reprogram_counter); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index b9a76dd98242..fe31bbd1f906 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -174,7 +174,7 @@ static inline void kvm_init_pmu_capability(void) } void reprogram_gp_counter(struct kvm_pmc *pmc); -void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); +void reprogram_fixed_counter(struct kvm_pmc *pmc); void reprogram_counter(struct kvm_pmc *pmc); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 61e14a5a247d..13d54c5fd12b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -37,23 +37,23 @@ static int fixed_pmc_events[] = {1, 0, 7}; static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { + struct kvm_pmc *pmc; + u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl; int i; + pmu->fixed_ctr_ctrl = data; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { u8 new_ctrl = fixed_ctrl_field(data, i); - u8 old_ctrl =
fixed_ctrl_field(pmu->fixed_ctr_ctrl, i); - struct kvm_pmc *pmc; - - pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + u8 old_ctrl = fixed_ctrl_field(old_fixed_ctr_ctrl, i); if (old_ctrl == new_ctrl) continue; + pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); - reprogram_fixed_counter(pmc, new_ctrl, i); + reprogram_fixed_counter(pmc); } - - pmu->fixed_ctr_ctrl = data; } static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) -- cgit v1.2.3 From e99fae6edebcdf53658f531ee3c913ca74536355 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 25 May 2022 05:28:56 -0400 Subject: KVM: x86/pmu: Use only the uniform interface reprogram_counter() Since reprogram_counter(), reprogram_{gp, fixed}_counter() currently have the same incoming parameter "struct kvm_pmc *pmc", the callers can simplify the context by using the uniformly exported interface, which makes reprogram_{gp, fixed}_counter() static and eliminates EXPORT_SYMBOL_GPL. Signed-off-by: Like Xu Message-Id: <20220518132512.37864-8-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 6 ++---- arch/x86/kvm/pmu.h | 2 -- arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 4 ++-- 4 files changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 4c354298e516..d2a0581d9d4d 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -283,7 +283,7 @@ out: return allow_event; } -void reprogram_gp_counter(struct kvm_pmc *pmc) +static void reprogram_gp_counter(struct kvm_pmc *pmc) { u64 config; u32 type = PERF_TYPE_RAW; @@ -325,9 +325,8 @@ void reprogram_gp_counter(struct kvm_pmc *pmc) !(eventsel & ARCH_PERFMON_EVENTSEL_OS), eventsel & ARCH_PERFMON_EVENTSEL_INT); } -EXPORT_SYMBOL_GPL(reprogram_gp_counter); -void reprogram_fixed_counter(struct kvm_pmc *pmc) +static void reprogram_fixed_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); int idx = pmc->idx - INTEL_PMC_IDX_FIXED; @@ -355,7 +354,6 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc) !(en_field & 0x1), /* exclude kernel */ pmi); } -EXPORT_SYMBOL_GPL(reprogram_fixed_counter); void reprogram_counter(struct kvm_pmc *pmc) { diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index fe31bbd1f906..60faf27678d9 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -173,8 +173,6 @@ static inline void kvm_init_pmu_capability(void) KVM_PMC_MAX_FIXED); } -void reprogram_gp_counter(struct kvm_pmc *pmc); -void reprogram_fixed_counter(struct kvm_pmc *pmc); void reprogram_counter(struct kvm_pmc *pmc); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index a1fbb72d6fbb..79346def7c96 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -288,7 +288,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~pmu->reserved_bits; if (data != pmc->eventsel) { pmc->eventsel = data; - reprogram_gp_counter(pmc); + reprogram_counter(pmc); } return 0; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 13d54c5fd12b..0dc270e6717c 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -52,7 +52,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); - reprogram_fixed_counter(pmc); + reprogram_counter(pmc); } } @@ -493,7 +493,7 @@
static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) reserved_bits ^= HSW_IN_TX_CHECKPOINTED; if (!(data & reserved_bits)) { pmc->eventsel = data; - reprogram_gp_counter(pmc); + reprogram_counter(pmc); return 0; } } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) -- cgit v1.2.3 From 02791a5c362b7d71447efb1d0131d8368bb821f2 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 18 May 2022 21:25:09 +0800 Subject: KVM: x86/pmu: Use PERF_TYPE_RAW to merge reprogram_{gp,fixed}counter() The code sketch for reprogram_{gp, fixed}_counter() is similar, with the fixed counter using the PERF_TYPE_HARDWARE type and the gp counter able to use either the PERF_TYPE_HARDWARE or PERF_TYPE_RAW type depending on the pmc->eventsel value. After 'commit 761875634a5e ("KVM: x86/pmu: Setup pmc->eventsel for fixed PMCs")', the pmc->eventsel of the fixed counter will also have been set up with the same semantic value and will not be changed during the guest runtime. The original story of using the PERF_TYPE_HARDWARE type is to emulate guest architecture PMU on a host without architecture PMU (the Pentium 4), for which the guest vPMC needs to be reprogrammed using the kernel generic perf_hw_id. But essentially, "the HARDWARE is just a convenience wrapper over RAW IIRC", quoted from Peterz. So it could be pretty safe to use the PERF_TYPE_RAW type only in practice to program both gp and fixed counters naturally in reprogram_counter(). To make the gp and fixed counters more semantically symmetrical, the selection of EVENTSEL_{USER, OS, INT} bits is temporarily translated via fixed_ctr_ctrl before the pmc_reprogram_counter() call. Cc: Peter Zijlstra Suggested-by: Jim Mattson Signed-off-by: Like Xu Message-Id: <20220518132512.37864-9-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 79 +++++++++++++++--------------------------------------- 1 file changed, 21 insertions(+), 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index d2a0581d9d4d..b46a96604fe6 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -283,85 +283,48 @@ out: return allow_event; } -static void reprogram_gp_counter(struct kvm_pmc *pmc) +void reprogram_counter(struct kvm_pmc *pmc) { - u64 config; - u32 type = PERF_TYPE_RAW; struct kvm_pmu *pmu = pmc_to_pmu(pmc); u64 eventsel = pmc->eventsel; - - if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) - printk_once("kvm pmu: pin control bit is ignored\n"); + u64 new_config = eventsel; + u8 fixed_ctr_ctrl; pmc_pause_counter(pmc); - if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) + if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc)) return; if (!check_pmu_event_filter(pmc)) return; - if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | - ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_CMASK | - HSW_IN_TX | - HSW_IN_TX_CHECKPOINTED))) { - config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); - if (config != PERF_COUNT_HW_MAX) - type = PERF_TYPE_HARDWARE; - } + if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) + printk_once("kvm pmu: pin control bit is ignored\n"); - if (type == PERF_TYPE_RAW) - config = eventsel & pmu->raw_event_mask; + if (pmc_is_fixed(pmc)) { + fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED); + if (fixed_ctr_ctrl & 0x1) + eventsel |= ARCH_PERFMON_EVENTSEL_OS; + if (fixed_ctr_ctrl & 0x2) + eventsel |= ARCH_PERFMON_EVENTSEL_USR; + if (fixed_ctr_ctrl & 0x8) + eventsel |= ARCH_PERFMON_EVENTSEL_INT; + new_config =
(u64)fixed_ctr_ctrl; + } - if (pmc->current_config == eventsel && pmc_resume_counter(pmc)) + if (pmc->current_config == new_config && pmc_resume_counter(pmc)) return; pmc_release_perf_event(pmc); - pmc->current_config = eventsel; - pmc_reprogram_counter(pmc, type, config, + pmc->current_config = new_config; + pmc_reprogram_counter(pmc, PERF_TYPE_RAW, + (eventsel & pmu->raw_event_mask), !(eventsel & ARCH_PERFMON_EVENTSEL_USR), !(eventsel & ARCH_PERFMON_EVENTSEL_OS), eventsel & ARCH_PERFMON_EVENTSEL_INT); } - -static void reprogram_fixed_counter(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - int idx = pmc->idx - INTEL_PMC_IDX_FIXED; - u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx); - unsigned en_field = ctrl & 0x3; - bool pmi = ctrl & 0x8; - - pmc_pause_counter(pmc); - - if (!en_field || !pmc_is_enabled(pmc)) - return; - - if (!check_pmu_event_filter(pmc)) - return; - - if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc)) - return; - - pmc_release_perf_event(pmc); - - pmc->current_config = (u64)ctrl; - pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, - static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc), - !(en_field & 0x2), /* exclude user */ - !(en_field & 0x1), /* exclude kernel */ - pmi); -} - -void reprogram_counter(struct kvm_pmc *pmc) -{ - if (pmc_is_gp(pmc)) - reprogram_gp_counter(pmc); - else - reprogram_fixed_counter(pmc); -} EXPORT_SYMBOL_GPL(reprogram_counter); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From dc852ff5bb419195c7d64cfcbe26f747490fca14 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 18 May 2022 21:25:10 +0800 Subject: perf: x86/core: Add interface to query perfmon_event_map[] directly Currently, we have [intel|knc|p4|p6]_perfmon_event_map on the Intel platforms and amd_[f17h]_perfmon_event_map on the AMD platforms. Early clumsy KVM code or other potential perf_event users may have hard-coded these perfmon_maps (e.g., arch/x86/kvm/svm/pmu.c), so it would not make sense to program a common hardware event based on the generic "enum perf_hw_id" once the two tables do not match. Let's provide an interface for callers outside the perf subsystem to get the counter config based on the perfmon_event_map currently in use, and it also helps to save bytes. 
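A minimal usage sketch of the new helper (illustrative; the declaration comes from <asm/perf_event.h> and any x86 kernel-side caller outside the perf core may use it):

  #include <linux/perf_event.h>

  /*
   * Returns the EVENTSEL/UMASK encoding the active perfmon_event_map
   * uses for "instructions retired", or 0 if the event is not mapped.
   */
  static u64 instructions_event_config(void)
  {
      return perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS);
  }

Because the implementation bounds-checks hw_event against x86_pmu.max_events, an out-of-range perf_hw_id simply yields 0 instead of indexing past the table.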
Cc: Peter Zijlstra Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220518132512.37864-10-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/events/core.c | 11 +++++++++++ arch/x86/include/asm/perf_event.h | 6 ++++++ 2 files changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 330825160b9a..2e16c268a005 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -3005,3 +3005,14 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) cap->pebs_ept = x86_pmu.pebs_ept; } EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); + +u64 perf_get_hw_event_config(int hw_event) +{ + int max = x86_pmu.max_events; + + if (hw_event < max) + return x86_pmu.event_map(array_index_nospec(hw_event, max)); + + return 0; +} +EXPORT_SYMBOL_GPL(perf_get_hw_event_config); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 58e2fcbb8bcc..cc47044401ff 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -505,6 +505,7 @@ struct x86_pmu_lbr { }; extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); +extern u64 perf_get_hw_event_config(int hw_event); extern void perf_check_microcode(void); extern void perf_clear_dirty_counters(void); extern int x86_perf_rdpmc_index(struct perf_event *event); @@ -514,6 +515,11 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) memset(cap, 0, sizeof(*cap)); } +static inline u64 perf_get_hw_event_config(int hw_event) +{ + return 0; +} + static inline void perf_events_lapic_init(void) { } static inline void perf_check_microcode(void) { } #endif -- cgit v1.2.3 From 08dca7a8e73abfeb3a998714272d1d1c974b0190 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 18 May 2022 21:25:11 +0800 Subject: KVM: x86/pmu: Replace pmc_perf_hw_id() with perf_get_hw_event_config() With the help of perf_get_hw_event_config(), KVM could query the correct EVENTSEL_{EVENT, UMASK} pair of a kernel-generic hw event directly from the different *_perfmon_event_map[] by the kernel's pre-defined perf_hw_id. Also extend the bit range of the comparison field to AMD64_RAW_EVENT_MASK_NB to prevent AMD from defining EventSelect[11:8] into perfmon_event_map[] one day. 
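To illustrate why the wider comparison field matters (a sketch, assuming the usual definition of AMD64_RAW_EVENT_MASK_NB as the full event-select bits, including EventSelect[11:8] held in MSR bits 35:32, plus the unit mask):

  #include <asm/perf_event.h>

  /* Two eventsels program the same hardware event iff they agree on
   * the full event select and the unit mask. */
  static bool same_hw_event(u64 eventsel_a, u64 eventsel_b)
  {
      return !((eventsel_a ^ eventsel_b) & AMD64_RAW_EVENT_MASK_NB);
  }

  /* Example: 0x76 and ((1ULL << 32) | 0x76) differ only in
   * EventSelect[11:8]; the wide mask tells them apart, while a mask of
   * just the low EVENT | UMASK bits would wrongly call them equal. */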
Signed-off-by: Like Xu Message-Id: <20220518132512.37864-11-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b46a96604fe6..2843ce35c8d9 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -525,13 +525,8 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, unsigned int perf_hw_id) { - u64 old_eventsel = pmc->eventsel; - unsigned int config; - - pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK); - config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); - pmc->eventsel = old_eventsel; - return config == perf_hw_id; + return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) & + AMD64_RAW_EVENT_MASK_NB); } static inline bool cpl_is_matched(struct kvm_pmc *pmc) -- cgit v1.2.3 From 7aadaa988c5ea0894b3bbea598e4da56f078a289 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 18 May 2022 21:25:12 +0800 Subject: KVM: x86/pmu: Drop amd_event_mapping[] in the KVM context All gp or fixed counters have been reprogrammed using PERF_TYPE_RAW, which means that the table that maps perf_hw_id to event select values is no longer useful, at least for AMD. For Intel, the logic to check if the pmu event reported by Intel cpuid is not available is still required, in which case pmc_perf_hw_id() could be renamed to hw_event_is_unavail() and a bool value is returned to replace the semantics of "PERF_COUNT_HW_MAX+1". Signed-off-by: Like Xu Message-Id: <20220518132512.37864-12-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-pmu-ops.h | 2 +- arch/x86/kvm/pmu.c | 6 ++-- arch/x86/kvm/pmu.h | 2 +- arch/x86/kvm/svm/pmu.c | 56 ++-------------------------------- arch/x86/kvm/vmx/pmu_intel.c | 11 +++---- 5 files changed, 12 insertions(+), 65 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index fdfd8e06fee6..c17e3e96fc1d 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -12,7 +12,7 @@ BUILD_BUG_ON(1) * a NULL definition, for example if "static_call_cond()" will be used * at the call sites. 
*/ -KVM_X86_PMU_OP(pmc_perf_hw_id) +KVM_X86_PMU_OP(hw_event_available) KVM_X86_PMU_OP(pmc_is_enabled) KVM_X86_PMU_OP(pmc_idx_to_pmc) KVM_X86_PMU_OP(rdpmc_ecx_to_pmc) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 2843ce35c8d9..87483e503c46 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -158,9 +158,6 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, }; bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable); - if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX) - return; - attr.sample_period = get_sample_period(pmc, pmc->counter); if ((attr.config & HSW_IN_TX_CHECKPOINTED) && @@ -258,6 +255,9 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) __u64 key; int idx; + if (!static_call(kvm_x86_pmu_hw_event_available)(pmc)) + return false; + filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); if (!filter) goto out; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 60faf27678d9..0e719436b7b8 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -22,7 +22,7 @@ struct kvm_event_hw_type_mapping { }; struct kvm_pmu_ops { - unsigned int (*pmc_perf_hw_id)(struct kvm_pmc *pmc); + bool (*hw_event_available)(struct kvm_pmc *pmc); bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 79346def7c96..256244b8f89c 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -33,34 +33,6 @@ enum index { INDEX_ERROR, }; -/* duplicated from amd_perfmon_event_map, K7 and above should work. */ -static struct kvm_event_hw_type_mapping amd_event_mapping[] = { - [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x7d, 0x07, PERF_COUNT_HW_CACHE_REFERENCES }, - [3] = { 0x7e, 0x07, PERF_COUNT_HW_CACHE_MISSES }, - [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, - [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, -}; - -/* duplicated from amd_f17h_perfmon_event_map. 
*/ -static struct kvm_event_hw_type_mapping amd_f17h_event_mapping[] = { - [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x60, 0xff, PERF_COUNT_HW_CACHE_REFERENCES }, - [3] = { 0x64, 0x09, PERF_COUNT_HW_CACHE_MISSES }, - [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [6] = { 0x87, 0x02, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, - [7] = { 0x87, 0x01, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, -}; - -/* amd_pmc_perf_hw_id depends on these being the same size */ -static_assert(ARRAY_SIZE(amd_event_mapping) == - ARRAY_SIZE(amd_f17h_event_mapping)); - static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type) { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); @@ -154,31 +126,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, return &pmu->gp_counters[msr_to_index(msr)]; } -static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) +static bool amd_hw_event_available(struct kvm_pmc *pmc) { - struct kvm_event_hw_type_mapping *event_mapping; - u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; - u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; - int i; - - /* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ - if (WARN_ON(pmc_is_fixed(pmc))) - return PERF_COUNT_HW_MAX; - - if (guest_cpuid_family(pmc->vcpu) >= 0x17) - event_mapping = amd_f17h_event_mapping; - else - event_mapping = amd_event_mapping; - - for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) - if (event_mapping[i].eventsel == event_select - && event_mapping[i].unit_mask == unit_mask) - break; - - if (i == ARRAY_SIZE(amd_event_mapping)) - return PERF_COUNT_HW_MAX; - - return event_mapping[i].event_type; + return true; } /* check if a PMC is enabled by comparing it against global_ctrl bits. Because @@ -345,7 +295,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) } struct kvm_pmu_ops amd_pmu_ops __initdata = { - .pmc_perf_hw_id = amd_pmc_perf_hw_id, + .hw_event_available = amd_hw_event_available, .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 0dc270e6717c..5b85320fc9f1 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -84,7 +84,7 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) } } -static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) +static bool intel_hw_event_available(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; @@ -98,15 +98,12 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) /* disable event that reported as not present by cpuid */ if ((i < 7) && !(pmu->available_event_types & (1 << i))) - return PERF_COUNT_HW_MAX + 1; + return false; break; } - if (i == ARRAY_SIZE(intel_arch_events)) - return PERF_COUNT_HW_MAX; - - return intel_arch_events[i].event_type; + return true; } /* check if a PMC is enabled by comparing it with globl_ctrl bits. 
*/ @@ -814,7 +811,7 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) } struct kvm_pmu_ops intel_pmu_ops __initdata = { - .pmc_perf_hw_id = intel_pmc_perf_hw_id, + .hw_event_available = intel_hw_event_available, .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, -- cgit v1.2.3 From ed2351174e38ad4febbbc0dba802803e6cff8ae0 Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Tue, 24 May 2022 21:56:21 +0800 Subject: KVM: x86: Extend KVM_{G,S}ET_VCPU_EVENTS to support pending triple fault For the triple fault synthesized by KVM, e.g. the RSM path or nested_vmx_abort(), if KVM exits to userspace before the request is serviced, userspace could migrate the VM and lose the triple fault. Extend KVM_{G,S}ET_VCPU_EVENTS to support pending triple fault with a new event KVM_VCPUEVENT_VALID_TRIPLE_FAULT so that userspace can save and restore the triple fault event. This extension is guarded by a new KVM capability KVM_CAP_TRIPLE_FAULT_EVENT. Note that in the set_vcpu_events path, userspace is able to set/clear the triple fault request through the triple_fault.pending field. Signed-off-by: Chenyi Qiang Message-Id: <20220524135624.22988-2-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 8 ++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/include/uapi/asm/kvm.h | 6 +++++- arch/x86/kvm/x86.c | 21 ++++++++++++++++++++- include/uapi/linux/kvm.h | 1 + 5 files changed, 36 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 421479a67da5..f67e367c4059 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1150,6 +1150,10 @@ The following bits are defined in the flags field: fields contain a valid state. This bit will be set whenever KVM_CAP_EXCEPTION_PAYLOAD is enabled. +- KVM_VCPUEVENT_VALID_TRIPLE_FAULT may be set to signal that the + triple_fault_pending field contains a valid state. This bit will + be set whenever KVM_CAP_TRIPLE_FAULT_EVENT is enabled. + ARM64: ^^^^^^ @@ -1245,6 +1249,10 @@ can be set in the flags field to signal that the exception_has_payload, exception_payload, and exception.pending fields contain a valid state and shall be written into the VCPU. +If KVM_CAP_TRIPLE_FAULT_EVENT is enabled, KVM_VCPUEVENT_VALID_TRIPLE_FAULT +can be set in flags field to signal that the triple_fault field contains +a valid state and shall be written into the VCPU.
+ ARM64: ^^^^^^ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 032278f0ee6d..d6c62276e131 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1174,6 +1174,8 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + bool triple_fault_event; + bool bus_lock_detection_enabled; bool enable_pmu; /* diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 21614807a2cb..24c807c8d5f7 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -325,6 +325,7 @@ struct kvm_reinject_control { #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 #define KVM_VCPUEVENT_VALID_SMM 0x00000008 #define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010 +#define KVM_VCPUEVENT_VALID_TRIPLE_FAULT 0x00000020 /* Interrupt shadow states */ #define KVM_X86_SHADOW_INT_MOV_SS 0x01 @@ -359,7 +360,10 @@ struct kvm_vcpu_events { __u8 smm_inside_nmi; __u8 latched_init; } smi; - __u8 reserved[27]; + struct { + __u8 pending; + } triple_fault; + __u8 reserved[26]; __u8 exception_has_payload; __u64 exception_payload; }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c8dfdef9e52f..422fbb0d7518 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4296,6 +4296,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: case KVM_CAP_X86_USER_SPACE_MSR: @@ -4942,6 +4943,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SMM); if (vcpu->kvm->arch.exception_payload_enabled) events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD; + if (vcpu->kvm->arch.triple_fault_event) { + events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu); + events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; + } memset(&events->reserved, 0, sizeof(events->reserved)); } @@ -4955,7 +4960,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW | KVM_VCPUEVENT_VALID_SMM - | KVM_VCPUEVENT_VALID_PAYLOAD)) + | KVM_VCPUEVENT_VALID_PAYLOAD + | KVM_VCPUEVENT_VALID_TRIPLE_FAULT)) return -EINVAL; if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) { @@ -5028,6 +5034,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, } } + if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) { + if (!vcpu->kvm->arch.triple_fault_event) + return -EINVAL; + if (events->triple_fault.pending) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + else + kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); + } + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; @@ -6029,6 +6044,10 @@ split_irqchip_unlock: kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: + kvm->arch.triple_fault_event = cap->args[0]; + r = 0; + break; case KVM_CAP_X86_USER_SPACE_MSR: kvm->arch.user_space_msr_mask = cap->args[0]; r = 0; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c4a32910b88a..ca799319acfd 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1158,6 +1158,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SYSTEM_EVENT_DATA 215 #define KVM_CAP_ARM_SYSTEM_SUSPEND 216 #define KVM_CAP_S390_PROTECTED_DUMP 217 +#define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 
938c8745bcf2f732ee928a0b9bd592198a88cfa4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 24 May 2022 21:56:23 +0800 Subject: KVM: x86: Introduce "struct kvm_caps" to track misc caps/settings Add kvm_caps to hold a variety of capabilities and defaults that aren't handled by kvm_cpu_caps because they aren't CPUID bits, in order to reduce the amount of boilerplate code required to add a new feature. The vast majority (all?) of the caps interact with vendor code and are written only during initialization, i.e. should be tagged __read_mostly, declared extern in x86.h, and exported. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220524135624.22988-4-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 15 ------- arch/x86/kvm/cpuid.c | 8 ++-- arch/x86/kvm/debugfs.c | 4 +- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/svm/nested.c | 4 +- arch/x86/kvm/svm/svm.c | 13 +++--- arch/x86/kvm/vmx/nested.c | 4 +- arch/x86/kvm/vmx/vmx.c | 22 +++++----- arch/x86/kvm/x86.c | 97 ++++++++++++++++++----------------------- arch/x86/kvm/x86.h | 26 +++++++++-- 10 files changed, 94 insertions(+), 101 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d6c62276e131..4e00bca08cfa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1664,21 +1664,6 @@ extern bool tdp_enabled; u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); -/* control of guest tsc rate supported? */ -extern bool kvm_has_tsc_control; -/* maximum supported tsc_khz for guests */ -extern u32 kvm_max_guest_tsc_khz; -/* number of bits of the fractional part of the TSC scaling ratio */ -extern u8 kvm_tsc_scaling_ratio_frac_bits; -/* maximum allowed value of TSC scaling ratio */ -extern u64 kvm_max_tsc_scaling_ratio; -/* 1ull << kvm_tsc_scaling_ratio_frac_bits */ -extern u64 kvm_default_tsc_scaling_ratio; -/* bus lock detection supported? */ -extern bool kvm_has_bus_lock_exit; - -extern u64 kvm_mce_cap_supported; - /* * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing * userspace I/O) to indicate that the emulation context diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 211f4566641e..d47222ab8e6e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -200,7 +200,7 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) /* * Calculate guest's supported XCR0 taking into account guest CPUID data and - * supported_xcr0 (comprised of host configuration and KVM_SUPPORTED_XCR0). + * KVM's supported XCR0 (comprised of host's XCR0 and KVM_SUPPORTED_XCR0).
*/ static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent) { @@ -210,7 +210,7 @@ static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent) if (!best) return 0; - return (best->eax | ((u64)best->edx << 32)) & supported_xcr0; + return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0; } static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, @@ -912,8 +912,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case 0xd: { - u64 permitted_xcr0 = supported_xcr0 & xstate_get_guest_group_perm(); - u64 permitted_xss = supported_xss; + u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm(); + u64 permitted_xss = kvm_caps.supported_xss; entry->eax &= permitted_xcr0; entry->ebx = xstate_required_size(permitted_xcr0, false); diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 9240b3b7f8dd..cfed36aba2f7 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -48,7 +48,7 @@ DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val) { - *val = kvm_tsc_scaling_ratio_frac_bits; + *val = kvm_caps.tsc_scaling_ratio_frac_bits; return 0; } @@ -66,7 +66,7 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_ debugfs_dentry, vcpu, &vcpu_timer_advance_ns_fops); - if (kvm_has_tsc_control) { + if (kvm_caps.has_tsc_control) { debugfs_create_file("tsc-scaling-ratio", 0444, debugfs_dentry, vcpu, &vcpu_tsc_scaling_fops); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 39b805666a18..e69b83708f05 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1603,7 +1603,7 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles) * that __delay() uses delay_tsc whenever the hardware has TSC, thus * always for VMX enabled hardware. 
*/ - if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) { + if (vcpu->arch.tsc_scaling_ratio == kvm_caps.default_tsc_scaling_ratio) { __delay(min(guest_cycles, nsec_to_cycles(vcpu, timer_advance_ns))); } else { diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 688f86b9202a..f958a656ff29 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -674,7 +674,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; - if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { WARN_ON(!svm->tsc_scaling_enabled); nested_svm_update_tsc_ratio_msr(vcpu); } @@ -1031,7 +1031,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); } - if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { WARN_ON(!svm->tsc_scaling_enabled); vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 5fda7e7102f2..750933ca6b9f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1285,7 +1285,7 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) svm_init_osvw(vcpu); vcpu->arch.microcode_version = 0x01000065; - svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio; + svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio; if (sev_es_guest(vcpu->kvm)) sev_es_vcpu_reset(svm); @@ -4868,7 +4868,7 @@ static __init void svm_set_cpu_caps(void) { kvm_set_cpu_caps(); - supported_xss = 0; + kvm_caps.supported_xss = 0; /* CPUID 0x80000001 and 0x8000000A (SVM features) */ if (nested) { @@ -4944,7 +4944,8 @@ static __init int svm_hardware_setup(void) init_msrpm_offsets(); - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | + XFEATURE_MASK_BNDCSR); if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); @@ -4954,11 +4955,11 @@ static __init int svm_hardware_setup(void) tsc_scaling = false; } else { pr_info("TSC scaling supported\n"); - kvm_has_tsc_control = true; + kvm_caps.has_tsc_control = true; } } - kvm_max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; + kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; + kvm_caps.tsc_scaling_ratio_frac_bits = 32; tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index f5cb18e00e78..5c5f4e3762f5 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2548,7 +2548,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmx_get_l2_tsc_multiplier(vcpu)); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); @@ -4610,7 +4610,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); if (vmx->nested.l1_tpr_threshold != -1) diff --git a/arch/x86/kvm/vmx/vmx.c 
b/arch/x86/kvm/vmx/vmx.c index 55a8578255cb..6d631941ac1a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1717,7 +1717,7 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) return vmcs12->tsc_multiplier; - return kvm_default_tsc_scaling_ratio; + return kvm_caps.default_tsc_scaling_ratio; } static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) @@ -7544,7 +7544,7 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_UMIP); /* CPUID 0xD.1 */ - supported_xss = 0; + kvm_caps.supported_xss = 0; if (!cpu_has_vmx_xsaves()) kvm_cpu_cap_clear(X86_FEATURE_XSAVES); @@ -7685,9 +7685,9 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, delta_tsc = 0; /* Convert to host delta tsc if tsc scaling is enabled */ - if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio && delta_tsc && u64_shl_div_u64(delta_tsc, - kvm_tsc_scaling_ratio_frac_bits, + kvm_caps.tsc_scaling_ratio_frac_bits, vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) return -ERANGE; @@ -8064,8 +8064,8 @@ static __init int hardware_setup(void) } if (!cpu_has_vmx_mpx()) - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | - XFEATURE_MASK_BNDCSR); + kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | + XFEATURE_MASK_BNDCSR); if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) @@ -8132,11 +8132,11 @@ static __init int hardware_setup(void) enable_ipiv = false; if (cpu_has_vmx_tsc_scaling()) - kvm_has_tsc_control = true; + kvm_caps.has_tsc_control = true; - kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; - kvm_tsc_scaling_ratio_frac_bits = 48; - kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; + kvm_caps.tsc_scaling_ratio_frac_bits = 48; + kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ @@ -8193,7 +8193,7 @@ static __init int hardware_setup(void) vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } - kvm_mce_cap_supported |= MCG_LMCE_P; + kvm_caps.supported_mce_cap |= MCG_LMCE_P; if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) return -EINVAL; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 422fbb0d7518..53e5f2ad2422 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -87,8 +87,11 @@ #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 -u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; -EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); + +struct kvm_caps kvm_caps __read_mostly = { + .supported_mce_cap = MCG_CTL_P | MCG_SER_P, +}; +EXPORT_SYMBOL_GPL(kvm_caps); #define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) @@ -151,19 +154,6 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); static bool __read_mostly kvmclock_periodic_sync = true; module_param(kvmclock_periodic_sync, bool, S_IRUGO); -bool __read_mostly kvm_has_tsc_control; -EXPORT_SYMBOL_GPL(kvm_has_tsc_control); -u32 __read_mostly kvm_max_guest_tsc_khz; -EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); -u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; -EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); -u64 __read_mostly kvm_max_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); -u64 __read_mostly kvm_default_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); -bool 
__read_mostly kvm_has_bus_lock_exit; -EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit); - /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); @@ -235,8 +225,6 @@ EXPORT_SYMBOL_GPL(enable_apicv); u64 __read_mostly host_xss; EXPORT_SYMBOL_GPL(host_xss); -u64 __read_mostly supported_xss; -EXPORT_SYMBOL_GPL(supported_xss); const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), @@ -309,8 +297,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { }; u64 __read_mostly host_xcr0; -u64 __read_mostly supported_xcr0; -EXPORT_SYMBOL_GPL(supported_xcr0); static struct kmem_cache *x86_emulator_cache; @@ -2345,12 +2331,12 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) /* Guest TSC same frequency as host TSC? */ if (!scale) { - kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); return 0; } /* TSC scaling supported? */ - if (!kvm_has_tsc_control) { + if (!kvm_caps.has_tsc_control) { if (user_tsc_khz > tsc_khz) { vcpu->arch.tsc_catchup = 1; vcpu->arch.tsc_always_catchup = 1; @@ -2362,10 +2348,10 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) } /* TSC scaling required - calculate ratio */ - ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, + ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits, user_tsc_khz, tsc_khz); - if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { + if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) { pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", user_tsc_khz); return -1; @@ -2383,7 +2369,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) /* tsc_khz can be zero if TSC calibration fails */ if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ - kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); return -1; } @@ -2460,18 +2446,18 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) * (frac) represent the fractional part, ie. ratio represents a fixed * point number (mult + frac * 2^(-N)). * - * N equals to kvm_tsc_scaling_ratio_frac_bits. + * N equals to kvm_caps.tsc_scaling_ratio_frac_bits. 
*/ static inline u64 __scale_tsc(u64 ratio, u64 tsc) { - return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); + return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits); } u64 kvm_scale_tsc(u64 tsc, u64 ratio) { u64 _tsc = tsc; - if (ratio != kvm_default_tsc_scaling_ratio) + if (ratio != kvm_caps.default_tsc_scaling_ratio) _tsc = __scale_tsc(ratio, tsc); return _tsc; @@ -2498,11 +2484,11 @@ u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) { u64 nested_offset; - if (l2_multiplier == kvm_default_tsc_scaling_ratio) + if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio) nested_offset = l1_offset; else nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier, - kvm_tsc_scaling_ratio_frac_bits); + kvm_caps.tsc_scaling_ratio_frac_bits); nested_offset += l2_offset; return nested_offset; @@ -2511,9 +2497,9 @@ EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset); u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) { - if (l2_multiplier != kvm_default_tsc_scaling_ratio) + if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio) return mul_u64_u64_shr(l1_multiplier, l2_multiplier, - kvm_tsc_scaling_ratio_frac_bits); + kvm_caps.tsc_scaling_ratio_frac_bits); return l1_multiplier; } @@ -2555,7 +2541,7 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli else vcpu->arch.tsc_scaling_ratio = l1_multiplier; - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) static_call(kvm_x86_write_tsc_multiplier)( vcpu, vcpu->arch.tsc_scaling_ratio); } @@ -2691,7 +2677,7 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { - if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio) WARN_ON(adjustment < 0); adjustment = kvm_scale_tsc((u64) adjustment, vcpu->arch.l1_tsc_scaling_ratio); @@ -3104,7 +3090,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz, v->arch.l1_tsc_scaling_ratio); @@ -3613,7 +3599,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than * XSAVES/XRSTORS to save/restore PT MSRs. 
*/ - if (data & ~supported_xss) + if (data & ~kvm_caps.supported_xss) return 1; vcpu->arch.ia32_xss = data; kvm_update_cpuid_runtime(vcpu); @@ -4374,7 +4360,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_TSC_CONTROL: case KVM_CAP_VM_TSC_CONTROL: - r = kvm_has_tsc_control; + r = kvm_caps.has_tsc_control; break; case KVM_CAP_X2APIC_API: r = KVM_X2APIC_API_VALID_FLAGS; @@ -4396,7 +4382,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = sched_info_on(); break; case KVM_CAP_X86_BUS_LOCK_EXIT: - if (kvm_has_bus_lock_exit) + if (kvm_caps.has_bus_lock_exit) r = KVM_BUS_LOCK_DETECTION_OFF | KVM_BUS_LOCK_DETECTION_EXIT; else @@ -4405,7 +4391,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_XSAVE2: { u64 guest_perm = xstate_get_guest_group_perm(); - r = xstate_required_size(supported_xcr0 & guest_perm, false); + r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false); if (r < sizeof(struct kvm_xsave)) r = sizeof(struct kvm_xsave); break; @@ -4443,7 +4429,7 @@ static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) switch (attr->attr) { case KVM_X86_XCOMP_GUEST_SUPP: - if (put_user(supported_xcr0, uaddr)) + if (put_user(kvm_caps.supported_xcr0, uaddr)) return -EFAULT; return 0; default: @@ -4520,8 +4506,8 @@ long kvm_arch_dev_ioctl(struct file *filp, } case KVM_X86_GET_MCE_CAP_SUPPORTED: r = -EFAULT; - if (copy_to_user(argp, &kvm_mce_cap_supported, - sizeof(kvm_mce_cap_supported))) + if (copy_to_user(argp, &kvm_caps.supported_mce_cap, + sizeof(kvm_caps.supported_mce_cap))) goto out; r = 0; break; @@ -4805,7 +4791,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, r = -EINVAL; if (!bank_num || bank_num > KVM_MAX_MCE_BANKS) goto out; - if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) + if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000)) goto out; r = 0; vcpu->arch.mcg_cap = mcg_cap; @@ -5111,7 +5097,8 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, guest_xsave->region, - supported_xcr0, &vcpu->arch.pkru); + kvm_caps.supported_xcr0, + &vcpu->arch.pkru); } static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, @@ -5616,8 +5603,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; user_tsc_khz = (u32)arg; - if (kvm_has_tsc_control && - user_tsc_khz >= kvm_max_guest_tsc_khz) + if (kvm_caps.has_tsc_control && + user_tsc_khz >= kvm_caps.max_guest_tsc_khz) goto out; if (user_tsc_khz == 0) @@ -6061,7 +6048,7 @@ split_irqchip_unlock: (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)) break; - if (kvm_has_bus_lock_exit && + if (kvm_caps.has_bus_lock_exit && cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT) kvm->arch.bus_lock_detection_enabled = true; r = 0; @@ -6610,8 +6597,8 @@ set_pit2_out: r = -EINVAL; user_tsc_khz = (u32)arg; - if (kvm_has_tsc_control && - user_tsc_khz >= kvm_max_guest_tsc_khz) + if (kvm_caps.has_tsc_control && + user_tsc_khz >= kvm_caps.max_guest_tsc_khz) goto out; if (user_tsc_khz == 0) @@ -8774,7 +8761,7 @@ static void kvm_hyperv_tsc_notifier(void) /* TSC frequency always matches when on Hyper-V */ for_each_present_cpu(cpu) per_cpu(cpu_tsc_khz, cpu) = tsc_khz; - kvm_max_guest_tsc_khz = tsc_khz; + kvm_caps.max_guest_tsc_khz = tsc_khz; list_for_each_entry(kvm, &vm_list, vm_list) { __kvm_start_pvclock_update(kvm); @@ -9036,7 +9023,7 @@ int kvm_arch_init(void *opaque) if (boot_cpu_has(X86_FEATURE_XSAVE)) { host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - supported_xcr0 = 
host_xcr0 & KVM_SUPPORTED_XCR0; + kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; } if (pi_inject_timer == -1) @@ -11748,13 +11735,13 @@ int kvm_arch_hardware_setup(void *opaque) kvm_register_perf_callbacks(ops->handle_intel_pt_intr); if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) - supported_xss = 0; + kvm_caps.supported_xss = 0; #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); #undef __kvm_cpu_cap_has - if (kvm_has_tsc_control) { + if (kvm_caps.has_tsc_control) { /* * Make sure the user can only configure tsc_khz values that * fit into a signed integer. @@ -11762,10 +11749,10 @@ int kvm_arch_hardware_setup(void *opaque) * be 1 on all machines. */ u64 max = min(0x7fffffffULL, - __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); - kvm_max_guest_tsc_khz = max; + __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz)); + kvm_caps.max_guest_tsc_khz = max; } - kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; + kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits; kvm_init_msr_list(); return 0; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 588792f00334..359d0454ad28 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,6 +8,25 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" +struct kvm_caps { + /* control of guest tsc rate supported? */ + bool has_tsc_control; + /* maximum supported tsc_khz for guests */ + u32 max_guest_tsc_khz; + /* number of bits of the fractional part of the TSC scaling ratio */ + u8 tsc_scaling_ratio_frac_bits; + /* maximum allowed value of TSC scaling ratio */ + u64 max_tsc_scaling_ratio; + /* 1ull << kvm_caps.tsc_scaling_ratio_frac_bits */ + u64 default_tsc_scaling_ratio; + /* bus lock detection supported? */ + bool has_bus_lock_exit; + + u64 supported_mce_cap; + u64 supported_xcr0; + u64 supported_xss; +}; + void kvm_spurious_fault(void); #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \ @@ -283,14 +302,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); extern u64 host_xcr0; -extern u64 supported_xcr0; extern u64 host_xss; -extern u64 supported_xss; + +extern struct kvm_caps kvm_caps; + extern bool enable_pmu; static inline bool kvm_mpx_supported(void) { - return (supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) + return (kvm_caps.supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) == (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); } -- cgit v1.2.3 From 2f4073e08f4cc5a41e35d777c240aaadd0257051 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Tue, 24 May 2022 21:56:24 +0800 Subject: KVM: VMX: Enable Notify VM exit There are cases in which malicious virtual machines can get the CPU stuck (because event windows don't open up), e.g., an infinite loop in microcode when handling a nested #AC (CVE-2015-5307). No event window means no event (NMI, SMI or IRQ) can be delivered, leaving the CPU unavailable to the host or other VMs. The VMM can enable notify VM exit, whereby a VM exit is generated if no event window occurs in VM non-root mode for a specified amount of time (the notify window). Feature enabling: - The new vmcs field SECONDARY_EXEC_NOTIFY_VM_EXITING is introduced to enable this feature. VMM can set the NOTIFY_WINDOW vmcs field to adjust the expected notify window. - Add a new KVM capability KVM_CAP_X86_NOTIFY_VMEXIT so that user space can query and enable this feature in per-VM scope.
The argument is a 64-bit value: bits 63:32 are used for the notify window, and bits 31:0 are used for flags. Currently supported flags: - KVM_X86_NOTIFY_VMEXIT_ENABLED: enable the feature with the notify window provided. - KVM_X86_NOTIFY_VMEXIT_USER: exit to userspace when such exits happen. - It's safe even to set the notify window to zero, since an internal hardware threshold is added to vmcs.notify_window. VM exit handling: - Introduce a vcpu state notify_window_exits to record the count of notify VM exits and expose it through debugfs. - A notify VM exit can happen incident to delivery of a vector event; allow it in KVM. - Exit to userspace unconditionally for handling when the VM_CONTEXT_INVALID bit is set. Nested handling - Nested notify VM exits are not supported yet. Keep the same notify window control in vmcs02 as vmcs01, so that L1 can't escape the restriction of notify VM exits through launching an L2 VM. Notify VM exit is defined in the latest Intel Architecture Instruction Set Extensions Programming Reference, chapter 9.2. Co-developed-by: Xiaoyao Li Signed-off-by: Xiaoyao Li Signed-off-by: Tao Xu Co-developed-by: Chenyi Qiang Signed-off-by: Chenyi Qiang Message-Id: <20220524135624.22988-5-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 49 ++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 7 ++++++ arch/x86/include/asm/vmx.h | 7 ++++++ arch/x86/include/asm/vmxfeatures.h | 1 + arch/x86/include/uapi/asm/vmx.h | 4 +++- arch/x86/kvm/vmx/capabilities.h | 6 +++++ arch/x86/kvm/vmx/nested.c | 8 +++++++ arch/x86/kvm/vmx/vmx.c | 40 +++++++++++++++++++++++++++++-- arch/x86/kvm/x86.c | 22 ++++++++++++++++- arch/x86/kvm/x86.h | 7 ++++++ include/uapi/linux/kvm.h | 11 +++++++++ 11 files changed, 158 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index f67e367c4059..30e31a886422 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6557,6 +6557,26 @@ array field represents return values. The userspace should update the return values of SBI call before resuming the VCPU. For more details on RISC-V SBI spec refer, https://github.com/riscv/riscv-sbi-doc. +:: + + /* KVM_EXIT_NOTIFY */ + struct { + #define KVM_NOTIFY_CONTEXT_INVALID (1 << 0) + __u32 flags; + } notify; + +Used on x86 systems. When the VM capability KVM_CAP_X86_NOTIFY_VMEXIT is +enabled, a VM exit is generated if no event window occurs in VM non-root mode +for a specified amount of time. If KVM_X86_NOTIFY_VMEXIT_USER was set when +enabling the cap, KVM exits to userspace with the exit reason +KVM_EXIT_NOTIFY for further handling. The "flags" field contains more +detailed info. + +The valid value for 'flags' is: + + - KVM_NOTIFY_CONTEXT_INVALID -- the VM context is corrupted and no longer valid + in the VMCS. Resuming the target VM may produce unpredictable results. + :: /* Fix the size of the union. */ @@ -7523,6 +7543,35 @@ if the value was set to zero or KVM_ENABLE_CAP was not invoked, KVM uses the return value of KVM_CHECK_EXTENSION(KVM_CAP_MAX_VCPU_ID) as the maximum APIC ID. +7.33 KVM_CAP_X86_NOTIFY_VMEXIT +------------------------------ + +:Architectures: x86 +:Target: VM +:Parameters: args[0] is the value of the notify window as well as the flags +:Returns: 0 on success, -EINVAL if args[0] contains invalid flags or notify + VM exit is unsupported. + +Bits 63:32 of args[0] are used for the notify window. +Bits 31:0 of args[0] carry the flags.
Valid bits are:: + + #define KVM_X86_NOTIFY_VMEXIT_ENABLED (1 << 0) + #define KVM_X86_NOTIFY_VMEXIT_USER (1 << 1) + +This capability allows userspace to configure the notify VM exit on/off +in per-VM scope during VM creation. Notify VM exit is disabled by default. +When userspace sets the KVM_X86_NOTIFY_VMEXIT_ENABLED bit in args[0], VMM will +enable this feature with the notify window provided, which will generate +a VM exit if no event window occurs in VM non-root mode for a specified amount +of time (the notify window). + +If KVM_X86_NOTIFY_VMEXIT_USER is set in args[0], KVM exits to userspace +for handling when notify VM exits happen. + +This capability is aimed at mitigating the threat that malicious VMs can +get the CPU stuck (because event windows don't open up) and make the CPU +unavailable to the host or other VMs. + 8. Other capabilities. ====================== diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4e00bca08cfa..6cf5d77d7896 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -65,6 +65,9 @@ #define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \ KVM_BUS_LOCK_DETECTION_EXIT) +#define KVM_X86_NOTIFY_VMEXIT_VALID_BITS (KVM_X86_NOTIFY_VMEXIT_ENABLED | \ + KVM_X86_NOTIFY_VMEXIT_USER) + /* x86-specific vcpu->requests bit members */ #define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0) #define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1) @@ -1178,6 +1181,9 @@ struct kvm_arch { bool bus_lock_detection_enabled; bool enable_pmu; + + u32 notify_window; + u32 notify_vmexit_flags; /* * If exit_on_emulation_error is set, and the in-kernel instruction * emulator fails to emulate an instruction, allow userspace @@ -1325,6 +1331,7 @@ struct kvm_vcpu_stat { u64 directed_yield_attempted; u64 directed_yield_successful; u64 guest_mode; + u64 notify_window_exits; }; struct x86_instruction_info; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 89d2172787c5..c371ef695fcc 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -75,6 +75,7 @@ #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE) #define SECONDARY_EXEC_BUS_LOCK_DETECTION VMCS_CONTROL_BIT(BUS_LOCK_DETECTION) +#define SECONDARY_EXEC_NOTIFY_VM_EXITING VMCS_CONTROL_BIT(NOTIFY_VM_EXITING) /* * Definitions of Tertiary Processor-Based VM-Execution Controls.
@@ -280,6 +281,7 @@ enum vmcs_field { SECONDARY_VM_EXEC_CONTROL = 0x0000401e, PLE_GAP = 0x00004020, PLE_WINDOW = 0x00004022, + NOTIFY_WINDOW = 0x00004024, VM_INSTRUCTION_ERROR = 0x00004400, VM_EXIT_REASON = 0x00004402, VM_EXIT_INTR_INFO = 0x00004404, @@ -564,6 +566,11 @@ enum vm_entry_failure_code { #define EPT_VIOLATION_GVA_IS_VALID (1 << EPT_VIOLATION_GVA_IS_VALID_BIT) #define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT) +/* + * Exit Qualifications for NOTIFY VM EXIT + */ +#define NOTIFY_VM_CONTEXT_INVALID BIT(0) + /* * VM-instruction error numbers */ diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index 589608c157bf..c6a7eed03914 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -85,6 +85,7 @@ #define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */ #define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */ #define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */ +#define VMX_FEATURE_NOTIFY_VM_EXITING ( 2*32+ 31) /* VM-Exit when no event windows after notify window */ /* Tertiary Processor-Based VM-Execution Controls, word 3 */ #define VMX_FEATURE_IPI_VIRT ( 3*32+ 4) /* Enable IPI virtualization */ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 946d761adbd3..a5faf6d88f1b 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -91,6 +91,7 @@ #define EXIT_REASON_UMWAIT 67 #define EXIT_REASON_TPAUSE 68 #define EXIT_REASON_BUS_LOCK 74 +#define EXIT_REASON_NOTIFY 75 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -153,7 +154,8 @@ { EXIT_REASON_XRSTORS, "XRSTORS" }, \ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ { EXIT_REASON_TPAUSE, "TPAUSE" }, \ - { EXIT_REASON_BUS_LOCK, "BUS_LOCK" } + { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \ + { EXIT_REASON_NOTIFY, "NOTIFY" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index f14c4bef97e0..2d3f13b18714 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -436,4 +436,10 @@ static inline u64 vmx_supported_debugctl(void) return debugctl; } +static inline bool cpu_has_notify_vmexit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_NOTIFY_VM_EXITING; +} + #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 5c5f4e3762f5..7d8cd0ebcc75 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2133,6 +2133,8 @@ static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) { + struct kvm *kvm = vmx->vcpu.kvm; + /* * If vmcs02 hasn't been initialized, set the constant vmcs02 state * according to L0's settings (vmcs12 is irrelevant here). Host @@ -2175,6 +2177,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) if (cpu_has_vmx_encls_vmexit()) vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + /* * Set the MSR load/store lists to match L0's settings. 
Only the * addresses are constant (for vmcs02), the counts can change based @@ -6112,6 +6117,9 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); case EXIT_REASON_ENCLS: return nested_vmx_exit_handled_encls(vcpu, vmcs12); + case EXIT_REASON_NOTIFY: + /* Notify VM exit is not exposed to L1 */ + return false; default: return true; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6d631941ac1a..2e00890d752a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2499,7 +2499,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_BUS_LOCK_DETECTION; + SECONDARY_EXEC_BUS_LOCK_DETECTION | + SECONDARY_EXEC_NOTIFY_VM_EXITING; if (cpu_has_sgx()) opt2 |= SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, @@ -4417,6 +4418,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!vcpu->kvm->arch.bus_lock_detection_enabled) exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; + if (!kvm_notify_vmexit_enabled(vcpu->kvm)) + exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; + return exec_control; } @@ -4498,6 +4502,9 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx->ple_window_dirty = true; } + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ @@ -5784,6 +5791,32 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) return 1; } +static int handle_notify(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qual = vmx_get_exit_qual(vcpu); + bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; + + ++vcpu->stat.notify_window_exits; + + /* + * Notify VM exit happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || + context_invalid) { + vcpu->run->exit_reason = KVM_EXIT_NOTIFY; + vcpu->run->notify.flags = context_invalid ? + KVM_NOTIFY_CONTEXT_INVALID : 0; + return 0; + } + + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. 
Otherwise they set the kvm_run parameter to indicate what needs @@ -5841,6 +5874,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, + [EXIT_REASON_NOTIFY] = handle_notify, }; static const int kvm_vmx_max_exit_handlers = @@ -6214,7 +6248,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) exit_reason.basic != EXIT_REASON_EPT_VIOLATION && exit_reason.basic != EXIT_REASON_PML_FULL && exit_reason.basic != EXIT_REASON_APIC_ACCESS && - exit_reason.basic != EXIT_REASON_TASK_SWITCH)) { + exit_reason.basic != EXIT_REASON_TASK_SWITCH && + exit_reason.basic != EXIT_REASON_NOTIFY)) { int ndata = 3; vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -8137,6 +8172,7 @@ static __init int hardware_setup(void) kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; kvm_caps.tsc_scaling_ratio_frac_bits = 48; kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 53e5f2ad2422..a8014233fd57 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -284,7 +284,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, nested_run), STATS_DESC_COUNTER(VCPU, directed_yield_attempted), STATS_DESC_COUNTER(VCPU, directed_yield_successful), - STATS_DESC_ICOUNTER(VCPU, guest_mode) + STATS_DESC_ICOUNTER(VCPU, guest_mode), + STATS_DESC_COUNTER(VCPU, notify_window_exits), }; const struct kvm_stats_header kvm_vcpu_stats_header = { @@ -4402,6 +4403,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_DISABLE_QUIRKS2: r = KVM_X86_VALID_QUIRKS; break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = kvm_caps.has_notify_vmexit; + break; default: break; } @@ -6125,6 +6129,22 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = -EINVAL; + if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS) + break; + if (!kvm_caps.has_notify_vmexit) + break; + if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED)) + break; + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.notify_window = cap->args[0] >> 32; + kvm->arch.notify_vmexit_flags = (u32)cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 359d0454ad28..501b884b8cc4 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -21,6 +21,8 @@ struct kvm_caps { u64 default_tsc_scaling_ratio; /* bus lock detection supported? */ bool has_bus_lock_exit; + /* notify VM exit supported? */ + bool has_notify_vmexit; u64 supported_mce_cap; u64 supported_xcr0; @@ -364,6 +366,11 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm) return kvm->arch.cstate_in_guest; } +static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm) +{ + return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED; +} + enum kvm_intr_type { /* Values are arbitrary, but must be non-zero. 
*/ KVM_HANDLING_IRQ = 1, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ca799319acfd..7569b4ec199c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -270,6 +270,7 @@ struct kvm_xen_exit { #define KVM_EXIT_X86_BUS_LOCK 33 #define KVM_EXIT_XEN 34 #define KVM_EXIT_RISCV_SBI 35 +#define KVM_EXIT_NOTIFY 36 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -496,6 +497,11 @@ struct kvm_run { unsigned long args[6]; unsigned long ret[2]; } riscv_sbi; + /* KVM_EXIT_NOTIFY */ + struct { +#define KVM_NOTIFY_CONTEXT_INVALID (1 << 0) + __u32 flags; + } notify; /* Fix the size of the union. */ char padding[256]; }; @@ -1159,6 +1165,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_SYSTEM_SUSPEND 216 #define KVM_CAP_S390_PROTECTED_DUMP 217 #define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218 +#define KVM_CAP_X86_NOTIFY_VMEXIT 219 #ifdef KVM_CAP_IRQ_ROUTING @@ -2174,4 +2181,8 @@ struct kvm_stats_desc { /* Available with KVM_CAP_S390_PROTECTED_DUMP */ #define KVM_S390_PV_CPU_COMMAND _IOWR(KVMIO, 0xd0, struct kvm_pv_cmd) +/* Available with KVM_CAP_X86_NOTIFY_VMEXIT */ +#define KVM_X86_NOTIFY_VMEXIT_ENABLED (1ULL << 0) +#define KVM_X86_NOTIFY_VMEXIT_USER (1ULL << 1) + #endif /* __LINUX_KVM_H */ -- cgit v1.2.3 From 92d80178a35b1ab04c4a8250a06399150dbf0b6a Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 17 May 2022 11:40:45 -0400 Subject: perf/x86/intel: Fix the comment about guest LBR support on KVM Starting from v5.12, KVM reports guest LBR and extra_regs support when the host has relevant support. Just delete this part of the comment and fix a typo incidentally. Cc: Peter Zijlstra Reviewed-by: Kan Liang Reviewed-by: Andi Kleen Signed-off-by: Like Xu Signed-off-by: Yang Weijiang Message-Id: <20220517154100.29983-2-weijiang.yang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/events/intel/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 39832a5e7d75..4e9b7af9cc45 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6501,8 +6501,7 @@ __init int intel_pmu_init(void) x86_pmu.intel_ctrl); /* * Access LBR MSR may cause #GP under certain circumstances. - * E.g. KVM doesn't support LBR MSR - * Check all LBT MSR here. + * Check all LBR MSR here. * Disable LBR access if any LBR MSRs can not be accessed. */ if (x86_pmu.lbr_tos && !check_msr(x86_pmu.lbr_tos, 0x3UL)) -- cgit v1.2.3 From 916e3a4f950eac92c28cc138c10d86514ffebf98 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 1 Jun 2022 05:45:17 -0400 Subject: x86: events: Do not return bogus capabilities if PMU is broken If the PMU is broken due to firmware issues, check_hw_exists() will return false but perf_get_x86_pmu_capability() will still return data from x86_pmu. Likewise if some of the hotplug callbacks cannot be installed the contents of x86_pmu will not be reverted. Handle the failure in both cases by clearing x86_pmu if init_hw_perf_events() fails or reverts to software events only.
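As a side note, the fix follows a familiar defensive pattern: a capability query should return all-zero data when initialization failed, rather than leak stale state. A minimal stand-alone C sketch of that pattern (names are illustrative, not the kernel's):

	#include <stdbool.h>
	#include <string.h>

	struct pmu_capability {
		int version;
		int num_counters;
	};

	static struct pmu_capability pmu_caps;	/* left zeroed when init fails */
	static bool pmu_initialized;

	/* Callers always get a fully defined answer: zeros if the PMU is broken. */
	void get_pmu_capability(struct pmu_capability *cap)
	{
		if (!pmu_initialized) {
			memset(cap, 0, sizeof(*cap));
			return;
		}
		*cap = pmu_caps;
	}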
Co-developed-by: Like Xu Signed-off-by: Like Xu Signed-off-by: Paolo Bonzini --- arch/x86/events/core.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 2e16c268a005..f969410d0c90 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2103,14 +2103,15 @@ static int __init init_hw_perf_events(void) } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); - return 0; + err = 0; + goto out_bad_pmu; } pmu_check_apic(); /* sanity check that the hardware exists or is emulated */ if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed)) - return 0; + goto out_bad_pmu; pr_cont("%s PMU driver.\n", x86_pmu.name); @@ -2219,6 +2220,8 @@ out1: cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING); out: cpuhp_remove_state(CPUHP_PERF_X86_PREPARE); +out_bad_pmu: + memset(&x86_pmu, 0, sizeof(x86_pmu)); return err; } early_initcall(init_hw_perf_events); @@ -2990,6 +2993,11 @@ unsigned long perf_misc_flags(struct pt_regs *regs) void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) { + if (!x86_pmu_initialized()) { + memset(cap, 0, sizeof(*cap)); + return; + } + cap->version = x86_pmu.version; /* * KVM doesn't support the hybrid PMU yet. -- cgit v1.2.3 From d7808f739162c003d249168bfe4c571aba18fb8a Mon Sep 17 00:00:00 2001 From: Like Xu Date: Thu, 19 May 2022 01:01:18 +0800 Subject: KVM: x86/pmu: Update global enable_pmu when PMU is undetected On some virt platforms (L1 guest w/o PMU), the value of module parameter 'enable_pmu' for nested L2 guests should be updated at initialisation. Considering that there is no concept of "architecture pmu" in AMD or Hygon and that the versions (prior to Zen 4) are all 0, but that the theoretical available counters are at least AMD64_NUM_COUNTERS, the utility check_hw_exists() is reused in the initialisation call path. Opportunistically update Intel specific comments. Fixes: 8eeac7e999e8 ("KVM: x86/pmu: Add kvm_pmu_cap to optimize perf_get_x86_pmu_capability") Signed-off-by: Like Xu Message-Id: <20220518170118.66263-3-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 0e719436b7b8..d59e1cb3b5dc 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -159,14 +159,19 @@ extern struct x86_pmu_capability kvm_pmu_cap; static inline void kvm_init_pmu_capability(void) { + bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; + perf_get_x86_pmu_capability(&kvm_pmu_cap); - /* - * Only support guest architectural pmu on - * a host with architectural pmu. - */ - if (!kvm_pmu_cap.version) + /* + * For Intel, only support guest architectural pmu + * on a host with architectural pmu. + */ + if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp) { memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); + enable_pmu = false; + return; + } kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, -- cgit v1.2.3 From b9181c8ef35636152facc72f801f27b4264df8c0 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 1 Jun 2022 11:19:25 +0800 Subject: KVM: x86/pmu: Avoid exposing Intel BTS feature The BTS feature (including the ability to set the BTS and BTINT bits in the DEBUGCTL MSR) is currently unsupported on KVM. 
But we may try using the BTS facility on a PEBS enabled guest like this: perf record -e branches:u -c 1 -d ls and then we would encounter the following call trace: [] unchecked MSR access error: WRMSR to 0x1d9 (tried to write 0x00000000000003c0) at rIP: 0xffffffff810745e4 (native_write_msr+0x4/0x20) [] Call Trace: [] intel_pmu_enable_bts+0x5d/0x70 [] bts_event_add+0x54/0x70 [] event_sched_in+0xee/0x290 As there is no CPUID indicator or perf_capabilities valid bit field to convey this information, the platform hints that the Intel BTS feature is unavailable to the guest by setting the BTS_UNAVAIL bit in IA32_MISC_ENABLE. Signed-off-by: Like Xu Message-Id: <20220601031925.59693-3-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.h | 3 +++ arch/x86/kvm/vmx/pmu_intel.c | 3 ++- arch/x86/kvm/x86.c | 6 +++--- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index d59e1cb3b5dc..2c5b3c9f5531 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -8,6 +8,9 @@ #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) +#define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \ + MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) + /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ #define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5b85320fc9f1..422f0a6562ac 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -536,6 +536,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->pebs_enable_mask = ~0ull; pmu->pebs_data_cfg_mask = ~0ull; + vcpu->arch.ia32_misc_enable_msr |= MSR_IA32_MISC_ENABLE_PMU_RO_MASK; + entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry || !vcpu->kvm->arch.enable_pmu) return; @@ -623,7 +625,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) ~((1ull << pmu->nr_arch_gp_counters) - 1); } } else { - vcpu->arch.ia32_misc_enable_msr |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL; vcpu->arch.perf_capabilities &= ~PERF_CAP_PEBS_MASK; } } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a8014233fd57..79efdc19b4c8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3548,12 +3548,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_MISC_ENABLE: { u64 old_val = vcpu->arch.ia32_misc_enable_msr; - u64 pmu_mask = MSR_IA32_MISC_ENABLE_EMON | - MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL; + u64 pmu_mask = MSR_IA32_MISC_ENABLE_PMU_RO_MASK | + MSR_IA32_MISC_ENABLE_EMON; /* RO bits */ if (!msr_info->host_initiated && - ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL)) + ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK)) return 1; /* -- cgit v1.2.3 From 6ef25aa0a961298278301ae1d88106c701eb73fa Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 1 Jun 2022 11:19:24 +0800 Subject: KVM: x86/pmu: Restrict advanced features based on module enable_pmu Once the vPMU is disabled, KVM does not expose features such as: PEBS (via clear kvm_pmu_cap.pebs_ept), legacy LBR and ARCH_LBR, CPUID 0xA leaf, PDCM bit and MSR_IA32_PERF_CAPABILITIES, plus PT_MODE_HOST_GUEST mode. What this group of features has in common is that their use relies on the underlying PMU counters and on the host perf_event as a back-end resource requester, or shares part of the irq delivery path.
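In other words, a single module-level switch gates every feature that depends on the vPMU. A hedged sketch of the gating idea (illustrative names, not KVM's actual code):

	#include <stdbool.h>
	#include <stdint.h>

	static bool enable_pmu;	/* stand-in for the module parameter */

	/* With the vPMU off, every dependent capability collapses to zero:
	 * no PDCM, no PEBS, no LBR formats are advertised to the guest. */
	uint64_t get_perf_capabilities(uint64_t host_caps)
	{
		return enable_pmu ? host_caps : 0;
	}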
Signed-off-by: Like Xu Message-Id: <20220601031925.59693-2-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.h | 6 ++++-- arch/x86/kvm/vmx/capabilities.h | 4 ++++ arch/x86/kvm/vmx/vmx.c | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 2c5b3c9f5531..c1b61671ba1e 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -170,9 +170,11 @@ static inline void kvm_init_pmu_capability(void) * For Intel, only support guest architectural pmu * on a host with architectural pmu. */ - if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp) { - memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); + if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp) enable_pmu = false; + + if (!enable_pmu) { + memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); return; } diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 2d3f13b18714..292e58679d95 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -7,6 +7,7 @@ #include "lapic.h" #include "x86.h" #include "pmu.h" +#include "cpuid.h" extern bool __read_mostly enable_vpid; extern bool __read_mostly flexpriority_enabled; @@ -409,6 +410,9 @@ static inline u64 vmx_get_perf_capabilities(void) u64 perf_cap = PMU_CAP_FW_WRITES; u64 host_perf_cap = 0; + if (!enable_pmu) + return 0; + if (boot_cpu_has(X86_FEATURE_PDCM)) rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2e00890d752a..83eeecb4c7f7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7568,6 +7568,9 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); } + if (!enable_pmu) + kvm_cpu_cap_clear(X86_FEATURE_PDCM); + if (!enable_sgx) { kvm_cpu_cap_clear(X86_FEATURE_SGX); kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); @@ -8233,7 +8236,7 @@ static __init int hardware_setup(void) if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) return -EINVAL; - if (!enable_ept || !cpu_has_vmx_intel_pt()) + if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; if (pt_mode == PT_MODE_HOST_GUEST) vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; -- cgit v1.2.3 From 8e6a58e28b34e8d247e772159b8fa8f6bae39192 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 1 Jun 2022 11:19:23 +0800 Subject: KVM: x86/pmu: Accept 0 for absent PMU MSRs when host-initiated if !enable_pmu Whenever an MSR is part of KVM_GET_MSR_INDEX_LIST, as is the case for MSR_K7_EVNTSEL0 or MSR_F15H_PERF_CTL0, it has to be always retrievable and settable with KVM_GET_MSR and KVM_SET_MSR. Accept a zero value for these MSRs to obey the contract. 
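A hedged sketch of the uAPI contract this enforces (illustrative signatures, not KVM's): host-initiated accesses to a listed-but-absent PMU MSR must succeed, reading back 0 and accepting only 0 on write.

	#include <stdbool.h>
	#include <stdint.h>

	int pmu_get_msr(bool host_initiated, bool pmu_enabled, uint64_t *data)
	{
		if (host_initiated && !pmu_enabled) {
			*data = 0;	/* absent MSR reads back as zero */
			return 0;
		}
		return -1;		/* defer to the vendor implementation */
	}

	int pmu_set_msr(bool host_initiated, bool pmu_enabled, uint64_t data)
	{
		if (host_initiated && !pmu_enabled)
			return data != 0;	/* only a zero write is accepted */
		return -1;
	}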
Signed-off-by: Like Xu Message-Id: <20220601031925.59693-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 8 ++++++++ arch/x86/kvm/svm/pmu.c | 11 ++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 87483e503c46..6a32092460d3 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -442,11 +442,19 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { + if (msr_info->host_initiated && !vcpu->kvm->arch.enable_pmu) { + msr_info->data = 0; + return 0; + } + return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info); } int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { + if (msr_info->host_initiated && !vcpu->kvm->arch.enable_pmu) + return !!msr_info->data; + kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info); } diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 256244b8f89c..fe520b2649b5 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -182,7 +182,16 @@ static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated) { /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */ - return false; + if (!host_initiated) + return false; + + switch (msr) { + case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3: + case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: + return true; + default: + return false; + } } static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) -- cgit v1.2.3 From f5a81d0eb01e0dfebd175edffa7d0a1bdb74d026 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 May 2022 17:06:57 +0000 Subject: KVM: VMX: Sanitize VM-Entry/VM-Exit control pairs at kvm_intel load time Sanitize the VM-Entry/VM-Exit control pairs (load+load or load+clear) during setup instead of checking both controls in a pair at runtime. If only one control is supported, KVM will report the associated feature as not available, but will leave the supported control bit set in the VMCS config, which could lead to corruption of host state. E.g. if only the VM-Entry control is supported and the feature is not dynamically toggled, KVM will set the control in all VMCSes and load zeros without restoring host state. Note, while this is technically a bug fix, practically speaking no sane CPU or VMM would support only one control. KVM's behavior of checking both controls is mostly pedantry. 
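The approach, shown concretely in the diff below, boils down to clearing both halves of any half-supported pair. A generic stand-alone sketch of the idea (not the kernel's code):

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>

	struct ctrl_pair {
		uint32_t entry_bit;
		uint32_t exit_bit;
	};

	/* Drop both bits of a pair when only one direction is supported, so a
	 * feature is either fully usable or fully absent. */
	void sanitize_pairs(uint32_t *entry, uint32_t *exit,
			    const struct ctrl_pair *pairs, size_t n)
	{
		for (size_t i = 0; i < n; i++) {
			bool has_entry = *entry & pairs[i].entry_bit;
			bool has_exit = *exit & pairs[i].exit_bit;

			if (has_entry != has_exit) {
				*entry &= ~pairs[i].entry_bit;
				*exit &= ~pairs[i].exit_bit;
			}
		}
	}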
Cc: Chenyi Qiang Cc: Lei Wang Signed-off-by: Sean Christopherson Message-Id: <20220527170658.3571367-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/capabilities.h | 10 +++------- arch/x86/kvm/vmx/vmx.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 292e58679d95..069d8d298e1d 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -98,20 +98,17 @@ static inline bool cpu_has_vmx_posted_intr(void) static inline bool cpu_has_load_ia32_efer(void) { - return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER) && - (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_EFER); + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER; } static inline bool cpu_has_load_perf_global_ctrl(void) { - return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && - (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; } static inline bool cpu_has_vmx_mpx(void) { - return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && - (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS; } static inline bool cpu_has_vmx_tpr_shadow(void) @@ -378,7 +375,6 @@ static inline bool cpu_has_vmx_intel_pt(void) rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); return (vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT) && (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) && - (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_IA32_RTIT_CTL) && (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 83eeecb4c7f7..bc1a0baa5f60 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2446,6 +2446,23 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; + int i; + + /* + * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. + * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always + * intercepts writes to PAT and EFER, i.e. never enables those controls. 
+ */ + struct { + u32 entry_control; + u32 exit_control; + } const vmcs_entry_exit_pairs[] = { + { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, + { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, + { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, + { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, + { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, + }; memset(vmcs_conf, 0, sizeof(*vmcs_conf)); min = CPU_BASED_HLT_EXITING | @@ -2586,6 +2603,20 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, &_vmentry_control) < 0) return -EIO; + for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { + u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; + u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; + + if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) + continue; + + pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", + _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); + + _vmentry_control &= ~n_ctrl; + _vmexit_control &= ~x_ctrl; + } + /* * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they * can't be used due to an errata where VM Exit may incorrectly clear -- cgit v1.2.3 From 3dbec44d9c94d8350a39326561ac40f969c63d16 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 May 2022 17:06:58 +0000 Subject: KVM: VMX: Reject kvm_intel if an inconsistent VMCS config is detected Add an on-by-default module param, error_on_inconsistent_vmcs_config, to allow rejecting the load of kvm_intel if an inconsistent VMCS config is detected. Continuing on with an inconsistent, degraded config is undesirable in the vast majority of use cases, e.g. may result in a misconfigured VM, poor performance due to lack of fast MSR switching, or even security issues in the unlikely event the guest is relying on MPX. Practically speaking, an inconsistent VMCS config should never be encountered in a production quality environment, e.g. on bare metal it indicates a silicon defect (or a disturbing lack of validation by the hardware vendor), and in a virtualized machine (KVM as L1) it indicates a buggy/misconfigured L0 VMM/hypervisor. Provide a module param to override the behavior for testing purposes, or in the unlikely scenario that KVM is deployed on a flawed-but-usable CPU or virtual machine. Note, what is or isn't an inconsistency is somewhat subjective, e.g. one might argue that LOAD_EFER without SAVE_EFER is an inconsistency. KVM's unofficial guideline for an "inconsistency" is either scenarios that are completely nonsensical, e.g. the existing checks on having EPT/VPID knobs without EPT/VPID, and/or scenarios that prevent KVM from virtualizing or utilizing a feature, e.g. the unpaired entry/exit controls checks. Other checks that fall into one or both of the covered scenarios could be added in the future, e.g. asserting that a VMCS control is available if and only if the associated feature is supported in bare metal.
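The resulting policy is "always warn once, and fail setup only in strict mode". A hedged user-space sketch of that policy (fprintf and the literal -5 stand in for pr_warn_once() and -EIO):

	#include <stdbool.h>
	#include <stdio.h>

	static bool error_on_inconsistent = true;	/* mirrors the new module param */

	int handle_inconsistency(const char *what)
	{
		fprintf(stderr, "kvm_intel: inconsistent VMCS config: %s\n", what);
		return error_on_inconsistent ? -5 /* -EIO */ : 0;	/* 0: degrade */
	}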
Signed-off-by: Sean Christopherson Message-Id: <20220527170658.3571367-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bc1a0baa5f60..b959fe24c13b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -119,6 +119,9 @@ module_param(nested, bool, S_IRUGO); bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); +static bool __read_mostly error_on_inconsistent_vmcs_config = true; +module_param(error_on_inconsistent_vmcs_config, bool, 0444); + static bool __read_mostly dump_invalid_vmcs = 0; module_param(dump_invalid_vmcs, bool, 0644); @@ -2547,15 +2550,23 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, CPU_BASED_CR3_STORE_EXITING | CPU_BASED_INVLPG_EXITING); } else if (vmx_cap->ept) { - vmx_cap->ept = 0; pr_warn_once("EPT CAP should not exist if not support " "1-setting enable EPT VM-execution control\n"); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + vmx_cap->ept = 0; } if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && - vmx_cap->vpid) { - vmx_cap->vpid = 0; + vmx_cap->vpid) { pr_warn_once("VPID CAP should not exist if not support " "1-setting enable VPID VM-execution control\n"); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + vmx_cap->vpid = 0; } if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) { @@ -2613,6 +2624,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); + if (error_on_inconsistent_vmcs_config) + return -EIO; + _vmentry_control &= ~n_ctrl; _vmexit_control &= ~x_ctrl; } -- cgit v1.2.3 From b172862241b4849985c3e0e86cfb05d61e4a841d Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Tue, 31 May 2022 13:44:21 +0100 Subject: KVM: x86: PIT: Preserve state of speaker port data bit Currently the state of the speaker port (0x61) data bit (bit 1) is not saved in the exported state (kvm_pit_state2) and hence is lost when re-constructing guest state. This patch removes the 'speaker_data_port' field from kvm_kpit_state and instead tracks the state using a new KVM_PIT_FLAGS_SPEAKER_DATA_ON flag defined in the API. Signed-off-by: Paul Durrant Message-Id: <20220531124421.1427-1-pdurrant@amazon.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 4 +++- arch/x86/include/uapi/asm/kvm.h | 3 ++- arch/x86/kvm/i8254.c | 10 +++++++--- arch/x86/kvm/i8254.h | 1 - 4 files changed, 12 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 30e31a886422..9cbbfdb663b6 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -3006,7 +3006,9 @@ KVM_CREATE_PIT2. The state is returned in the following structure:: Valid flags are:: /* disable PIT in HPET legacy mode */ - #define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 + #define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 + /* speaker port data bit enabled */ + #define KVM_PIT_FLAGS_SPEAKER_DATA_ON 0x00000002 This IOCTL replaces the obsolete KVM_GET_PIT. 
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 24c807c8d5f7..50a4e787d5e6 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -306,7 +306,8 @@ struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; -#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 +#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 +#define KVM_PIT_FLAGS_SPEAKER_DATA_ON 0x00000002 struct kvm_pit_state2 { struct kvm_pit_channel_state channels[3]; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 1c83076091af..e0a7a0e7a73c 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -591,7 +591,10 @@ static int speaker_ioport_write(struct kvm_vcpu *vcpu, return -EOPNOTSUPP; mutex_lock(&pit_state->lock); - pit_state->speaker_data_on = (val >> 1) & 1; + if (val & (1 << 1)) + pit_state->flags |= KVM_PIT_FLAGS_SPEAKER_DATA_ON; + else + pit_state->flags &= ~KVM_PIT_FLAGS_SPEAKER_DATA_ON; pit_set_gate(pit, 2, val & 1); mutex_unlock(&pit_state->lock); return 0; @@ -612,8 +615,9 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu, refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; mutex_lock(&pit_state->lock); - ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) | - (pit_get_out(pit, 2) << 5) | (refresh_clock << 4)); + ret = (!!(pit_state->flags & KVM_PIT_FLAGS_SPEAKER_DATA_ON) << 1) | + pit_get_gate(pit, 2) | (pit_get_out(pit, 2) << 5) | + (refresh_clock << 4); if (len > sizeof(ret)) len = sizeof(ret); memcpy(data, (char *)&ret, len); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 394d9527da7e..a768212ba821 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -29,7 +29,6 @@ struct kvm_kpit_state { bool is_periodic; s64 period; /* unit: ns */ struct hrtimer timer; - u32 speaker_data_on; struct mutex lock; atomic_t reinject; -- cgit v1.2.3 From 61d9c412d0416aa1f7914a732d424a9e8ff24c36 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 26 May 2022 21:08:10 +0000 Subject: KVM: x86: Grab regs_dirty in local 'unsigned long' Capture ctxt->regs_dirty in a local 'unsigned long' instead of casting it to an 'unsigned long *' for use in for_each_set_bit(). The bitops helpers really do read the entire 'unsigned long', even though the walking of the read value is capped at the specified size. I.e. 64-bit KVM is reading memory beyond ctxt->regs_dirty, which is a u32 and thus 4 bytes, whereas an unsigned long is 8 bytes. Functionally it's not an issue because regs_dirty is in the middle of x86_emulate_ctxt, i.e. KVM is just reading its own memory, but relying on that coincidence is gross and unsafe. 
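A stand-alone illustration of the hazard (assumed field sizes mirror the emulator's; this is not the kernel's code):

	#include <stdint.h>
	#include <stdio.h>

	struct emu_ctxt {
		uint32_t regs_dirty;	/* 4 bytes, as before the fix */
	};

	/* Casting &regs_dirty to (unsigned long *) on a 64-bit host makes a bit
	 * walk read 8 bytes, i.e. 4 bytes past the field; widening into a local
	 * keeps the read in bounds. */
	void writeback_registers(const struct emu_ctxt *ctxt)
	{
		unsigned long dirty = ctxt->regs_dirty;	/* safe widening copy */

		for (unsigned int reg = 0; reg < 16; reg++)
			if (dirty & (1UL << reg))
				printf("write back GPR %u\n", reg);
	}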
Reviewed-by: Vitaly Kuznetsov Reviewed-by: Kees Cook Signed-off-by: Sean Christopherson Message-Id: <20220526210817.3428868-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 89b11e7dca8a..7226a127ccb4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -269,9 +269,10 @@ static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) static void writeback_registers(struct x86_emulate_ctxt *ctxt) { + unsigned long dirty = ctxt->regs_dirty; unsigned reg; - for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16) + for_each_set_bit(reg, &dirty, 16) ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]); } -- cgit v1.2.3 From dfe21e6bc05af433308bc1842da28a8fe28faaa4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 26 May 2022 21:08:11 +0000 Subject: KVM: x86: Harden _regs accesses to guard against buggy input WARN and truncate the incoming GPR number/index when reading/writing GPRs in the emulator to guard against KVM bugs, e.g. to avoid out-of-bounds accesses to ctxt->_regs[] if KVM generates a bogus index. Truncate the index instead of returning e.g. zero, as reg_write() returns a pointer to the register, i.e. returning zero would result in a NULL pointer dereference. KVM could also force the index to any arbitrary GPR, but that's no better or worse, just different. Open code the restriction to 16 registers; RIP is handled via _eip and should never be accessed through reg_read() or reg_write(). See the comments above the declarations of reg_read() and reg_write(), and the behavior of writeback_registers(). The horrific open coded mess will be cleaned up in a future commit. There are no such bugs known to exist in the emulator, but determining that KVM is bug-free is not at all simple and requires a deep dive into the emulator. 
The code is so convoluted that GCC-12 with the recently enable -Warray-bounds spits out a false-positive due to a GCC bug: arch/x86/kvm/emulate.c:254:27: warning: array subscript 32 is above array bounds of 'long unsigned int[17]' [-Warray-bounds] 254 | return ctxt->_regs[nr]; | ~~~~~~~~~~~^~~~ In file included from arch/x86/kvm/emulate.c:23: arch/x86/kvm/kvm_emulate.h: In function 'reg_rmw': arch/x86/kvm/kvm_emulate.h:366:23: note: while referencing '_regs' 366 | unsigned long _regs[NR_VCPU_REGS]; | ^~~~~ Link: https://lore.kernel.org/all/YofQlBrlx18J7h9Y@google.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=216026 Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105679 Reported-and-tested-by: Robert Dinse Reported-by: Kees Cook Reviewed-by: Kees Cook Signed-off-by: Sean Christopherson Message-Id: <20220526210817.3428868-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7226a127ccb4..c58366ae4da2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -247,6 +247,9 @@ enum x86_transfer_type { static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) { + if (WARN_ON_ONCE(nr >= 16)) + nr &= 16 - 1; + if (!(ctxt->regs_valid & (1 << nr))) { ctxt->regs_valid |= 1 << nr; ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr); @@ -256,6 +259,9 @@ static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) { + if (WARN_ON_ONCE(nr >= 16)) + nr &= 16 - 1; + ctxt->regs_valid |= 1 << nr; ctxt->regs_dirty |= 1 << nr; return &ctxt->_regs[nr]; -- cgit v1.2.3 From a5ba67b42f07952ec45755bbdd66d7c6e49f555c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 26 May 2022 21:08:12 +0000 Subject: KVM: x86: Omit VCPU_REGS_RIP from emulator's _regs array Omit RIP from the emulator's _regs array, which is used only for GPRs, i.e. registers that can be referenced via ModRM and/or SIB bytes. The emulator uses the dedicated _eip field for RIP, and manually reads from _eip to handle RIP-relative addressing. To avoid an even bigger, slightly more dangerous change, hardcode the number of GPRs to 16 for the time being even though 32-bit KVM's emulator technically should only have 8 GPRs. Add a TODO to address that in a future commit. See also the comments above the read_gpr() and write_gpr() declarations, and obviously the handling in writeback_registers(). No functional change intended. 
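For illustration, RIP-relative addressing needs only the instruction pointer, which the emulator tracks as _eip; a hedged sketch of the effective-address computation (not the emulator's code):

	#include <stdint.h>

	/* x86-64 RIP-relative operand: the displacement is added to the address
	 * of the *next* instruction. */
	uint64_t rip_relative_ea(uint64_t rip, uint8_t insn_len, int32_t disp)
	{
		return rip + insn_len + (int64_t)disp;
	}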
Signed-off-by: Sean Christopherson Reviewed-by: Kees Cook Message-Id: <20220526210817.3428868-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 10 +++++----- arch/x86/kvm/kvm_emulate.h | 13 ++++++++++++- 2 files changed, 17 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c58366ae4da2..c74c0fd3b860 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -247,8 +247,8 @@ enum x86_transfer_type { static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) { - if (WARN_ON_ONCE(nr >= 16)) - nr &= 16 - 1; + if (WARN_ON_ONCE(nr >= NR_EMULATOR_GPRS)) + nr &= NR_EMULATOR_GPRS - 1; if (!(ctxt->regs_valid & (1 << nr))) { ctxt->regs_valid |= 1 << nr; @@ -259,8 +259,8 @@ static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) { - if (WARN_ON_ONCE(nr >= 16)) - nr &= 16 - 1; + if (WARN_ON_ONCE(nr >= NR_EMULATOR_GPRS)) + nr &= NR_EMULATOR_GPRS - 1; ctxt->regs_valid |= 1 << nr; ctxt->regs_dirty |= 1 << nr; @@ -278,7 +278,7 @@ static void writeback_registers(struct x86_emulate_ctxt *ctxt) unsigned long dirty = ctxt->regs_dirty; unsigned reg; - for_each_set_bit(reg, &dirty, 16) + for_each_set_bit(reg, &dirty, NR_EMULATOR_GPRS) ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]); } diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 8dff25d267b7..bc3f8295c8c8 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -301,6 +301,17 @@ struct fastop; typedef void (*fastop_t)(struct fastop *); +/* + * The emulator's _regs array tracks only the GPRs, i.e. excludes RIP. RIP is + * tracked/accessed via _eip, and except for RIP relative addressing, which + * also uses _eip, RIP cannot be a register operand nor can it be an operand in + * a ModRM or SIB byte. + * + * TODO: this is technically wrong for 32-bit KVM, which only supports 8 GPRs; + * R8-R15 don't exist. + */ +#define NR_EMULATOR_GPRS 16 + struct x86_emulate_ctxt { void *vcpu; const struct x86_emulate_ops *ops; @@ -363,7 +374,7 @@ struct x86_emulate_ctxt { struct operand src2; struct operand dst; struct operand memop; - unsigned long _regs[NR_VCPU_REGS]; + unsigned long _regs[NR_EMULATOR_GPRS]; struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; -- cgit v1.2.3 From 0cbc60d44c35b1070eb4070b499164d27d050576 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 26 May 2022 21:08:13 +0000 Subject: KVM: x86: Use 16-bit fields to track dirty/valid emulator GPRs Use a u16 instead of a u32 to track the dirty/valid status of GPRs in the emulator. Unlike struct kvm_vcpu_arch, x86_emulate_ctxt tracks only the "true" GPRs, i.e. doesn't include RIP in its array, and so only needs to track 16 registers. Note, maxing out at 16 GPRs is a fundamental property of x86-64 and will not change barring a massive architecture update. Legacy x86 ModRM and SIB encodings use 3 bits for GPRs, i.e. support 8 registers. x86-64 uses a single bit in the REX prefix for each possible reference type to double the number of supported GPRs to 16 registers (4 bits). 
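A small sketch of why 16 is the architectural ceiling: the register index is 3 ModRM bits extended by one REX bit (REX has the form 0100WRXB, so REX.R is bit 2 of the prefix byte). Illustrative decoder, not KVM's:

	#include <stdint.h>

	unsigned int decode_modrm_reg(uint8_t rex, uint8_t modrm)
	{
		unsigned int rex_r = (rex >> 2) & 1;	/* 0 when no REX prefix */

		return (rex_r << 3) | ((modrm >> 3) & 7);	/* index 0-15 */
	}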
Reviewed-by: Kees Cook Signed-off-by: Sean Christopherson Message-Id: <20220526210817.3428868-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 3 +++ arch/x86/kvm/kvm_emulate.h | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c74c0fd3b860..5aaf1d1200af 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -262,6 +262,9 @@ static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) if (WARN_ON_ONCE(nr >= NR_EMULATOR_GPRS)) nr &= NR_EMULATOR_GPRS - 1; + BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS); + BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS); + ctxt->regs_valid |= 1 << nr; ctxt->regs_dirty |= 1 << nr; return &ctxt->_regs[nr]; diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index bc3f8295c8c8..3a65d6ea7fe6 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -356,9 +356,9 @@ struct x86_emulate_ctxt { u8 lock_prefix; u8 rep_prefix; /* bitmaps of registers in _regs[] that can be read */ - u32 regs_valid; + u16 regs_valid; /* bitmaps of registers in _regs[] that have been written */ - u32 regs_dirty; + u16 regs_dirty; /* modrm */ u8 modrm; u8 modrm_mod; -- cgit v1.2.3 From b443183a25ab61840a12de92f8822849e017b9c8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 26 May 2022 21:08:14 +0000 Subject: KVM: x86: Reduce the number of emulator GPRs to '8' for 32-bit KVM Reduce the number of GPRs emulated by 32-bit KVM from 16 to 8. KVM does not support emulating 64-bit mode on 32-bit host kernels, and so should never generate accesses to R8-15. Opportunistically use NR_EMULATOR_GPRS in rsm_load_state_{32,64}() now that it is precise and accurate for both flavors. Wrap the definition with full #ifdef ugliness; sadly, IS_ENABLED() doesn't guarantee a compile-time constant as far as BUILD_BUG_ON() is concerned. Signed-off-by: Sean Christopherson Reviewed-by: Kees Cook Message-Id: <20220526210817.3428868-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 4 ++-- arch/x86/kvm/kvm_emulate.h | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5aaf1d1200af..77161f57c8d3 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2441,7 +2441,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED; ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0); - for (i = 0; i < 8; i++) + for (i = 0; i < NR_EMULATOR_GPRS; i++) *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); val = GET_SMSTATE(u32, smstate, 0x7fcc); @@ -2498,7 +2498,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u16 selector; int i, r; - for (i = 0; i < 16; i++) + for (i = 0; i < NR_EMULATOR_GPRS; i++) *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8); ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 3a65d6ea7fe6..034c845b3c63 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -306,11 +306,12 @@ typedef void (*fastop_t)(struct fastop *); * tracked/accessed via _eip, and except for RIP relative addressing, which * also uses _eip, RIP cannot be a register operand nor can it be an operand in * a ModRM or SIB byte. 
- * - * TODO: this is technically wrong for 32-bit KVM, which only supports 8 GPRs; - * R8-R15 don't exist. */ +#ifdef CONFIG_X86_64 #define NR_EMULATOR_GPRS 16 +#else +#define NR_EMULATOR_GPRS 8 +#endif struct x86_emulate_ctxt { void *vcpu; const struct x86_emulate_ops *ops; -- cgit v1.2.3 From 1cca2f8c501fa01e6c0d2aefbf97f8c3c89c0186 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 26 May 2022 21:08:15 +0000 Subject: KVM: x86: Bug the VM if the emulator accesses a non-existent GPR Bug the VM, i.e. kill it, if the emulator accesses a non-existent GPR, i.e. generates an out-of-bounds GPR index. Continuing on all but guarantees some form of data corruption in the guest, e.g. even if KVM were to redirect to a dummy register, KVM would incorrectly read zeros and drop writes. Note, bugging the VM doesn't completely prevent data corruption, e.g. the current round of emulation will complete before the vCPU bails out to userspace. But, the very act of killing the guest can also cause data corruption, e.g. due to lack of file writeback before termination, so taking on additional complexity to cleanly bail out of the emulator isn't justified, the goal is purely to stem the bleeding and alert userspace that something has gone horribly wrong, i.e. to avoid _silent_ data corruption. Signed-off-by: Sean Christopherson Reviewed-by: Kees Cook Reviewed-by: Vitaly Kuznetsov Message-Id: <20220526210817.3428868-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 4 ++-- arch/x86/kvm/kvm_emulate.h | 10 ++++++++++ arch/x86/kvm/x86.c | 9 +++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 77161f57c8d3..70a8e0cd9fdc 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -247,7 +247,7 @@ enum x86_transfer_type { static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) { - if (WARN_ON_ONCE(nr >= NR_EMULATOR_GPRS)) + if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) nr &= NR_EMULATOR_GPRS - 1; if (!(ctxt->regs_valid & (1 << nr))) { @@ -259,7 +259,7 @@ static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) { - if (WARN_ON_ONCE(nr >= NR_EMULATOR_GPRS)) + if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) nr &= NR_EMULATOR_GPRS - 1; BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS); diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 034c845b3c63..89246446d6aa 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -89,6 +89,7 @@ struct x86_instruction_info { #define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */ struct x86_emulate_ops { + void (*vm_bugged)(struct x86_emulate_ctxt *ctxt); /* * read_gpr: read a general purpose register (rax - r15) * @@ -383,6 +384,15 @@ struct x86_emulate_ctxt { bool is_branch; }; +#define KVM_EMULATOR_BUG_ON(cond, ctxt) \ +({ \ + int __ret = (cond); \ + \ + if (WARN_ON_ONCE(__ret)) \ + ctxt->ops->vm_bugged(ctxt); \ + unlikely(__ret); \ +}) + /* Repeat String Operation Prefix */ #define REPE_PREFIX 0xf3 #define REPNE_PREFIX 0xf2 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e8177d8b378e..a0baf49bb00d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7915,7 +7915,16 @@ static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); } +static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt) +{ +
struct kvm *kvm = emul_to_vcpu(ctxt)->kvm; + + if (!kvm->vm_bugged) + kvm_vm_bugged(kvm); +} + static const struct x86_emulate_ops emulate_ops = { + .vm_bugged = emulator_vm_bugged, .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, .read_std = emulator_read_std, -- cgit v1.2.3 From 49a1431d3bea4092082b0082cd9f58f3ccdf57f4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 26 May 2022 21:08:16 +0000 Subject: KVM: x86: Bug the VM if the emulator generates a bogus exception vector Bug the VM if KVM's emulator attempts to inject a bogus exception vector. The guest is likely doomed even if KVM continues on, and propagating a bad vector to the rest of KVM runs the risk of breaking other assumptions in KVM and thus triggering a more egregious bug. All existing users of emulate_exception() have hardcoded vector numbers (__load_segment_descriptor() uses a few different vectors, but they're all hardcoded), and future users are likely to follow suit, i.e. the change to emulate_exception() is a glorified nop. As for the ctxt->exception.vector check in x86_emulate_insn(), the few known times the WARN has been triggered in the past were when the field was not set when synthesizing a fault, i.e. for all intents and purposes the check protects against consumption of uninitialized data. Signed-off-by: Sean Christopherson Reviewed-by: Kees Cook Reviewed-by: Vitaly Kuznetsov Message-Id: <20220526210817.3428868-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 70a8e0cd9fdc..2aa17462a9ac 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -624,7 +624,9 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, u32 error, bool valid) { - WARN_ON(vec > 0x1f); + if (KVM_EMULATOR_BUG_ON(vec > 0x1f, ctxt)) + return X86EMUL_UNHANDLEABLE; + ctxt->exception.vector = vec; ctxt->exception.error_code = error; ctxt->exception.error_code_valid = valid; @@ -5728,7 +5730,8 @@ writeback: done: if (rc == X86EMUL_PROPAGATE_FAULT) { - WARN_ON(ctxt->exception.vector > 0x1f); + if (KVM_EMULATOR_BUG_ON(ctxt->exception.vector > 0x1f, ctxt)) + return EMULATION_FAILED; ctxt->have_exception = true; } if (rc == X86EMUL_INTERCEPTED) -- cgit v1.2.3 From d38ea9579ce34dfe22378788e99f26eab31ea064 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 26 May 2022 21:08:17 +0000 Subject: KVM: x86: Bug the VM on an out-of-bounds data read Bug the VM and terminate emulation if an out-of-bounds read into the emulator's data cache occurs. Knowingly continuing on all but guarantees that KVM will overwrite random kernel data, which is far, far worse than killing the VM.
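For readers following the series, the control flow above can be modeled outside the kernel: a KVM_EMULATOR_BUG_ON()-style macro evaluates to its condition, so it can sit directly in an if, and fires a "VM bugged" callback when the condition is true. The sketch below is a minimal stand-alone analogue with hypothetical names (BUG_ON_CTX, mark_bugged); unlike WARN_ON_ONCE() it warns on every hit, and it relies on GNU C statement expressions, as the kernel does:

  #include <stdbool.h>
  #include <stdio.h>

  struct ctx {
          void (*vm_bugged)(struct ctx *c); /* like ops->vm_bugged */
          bool bugged;
  };

  /* Evaluates to the condition; on true, warn and invoke the callback. */
  #define BUG_ON_CTX(cond, ctx)                                \
  ({                                                           \
          int __ret = !!(cond);                                \
          if (__ret) {                                         \
                  fprintf(stderr, "bug: %s\n", #cond);         \
                  (ctx)->vm_bugged(ctx);                       \
          }                                                    \
          __ret;                                               \
  })

  static void mark_bugged(struct ctx *c)
  {
          c->bugged = true;
  }

  int main(void)
  {
          struct ctx c = { .vm_bugged = mark_bugged, .bugged = false };
          unsigned int nr = 16;

          /* Mirrors reg_read()/reg_write(): clamp the index after bugging. */
          if (BUG_ON_CTX(nr >= 8, &c))
                  nr &= 8 - 1;
          return c.bugged ? 1 : 0;
  }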
Signed-off-by: Sean Christopherson Reviewed-by: Kees Cook Reviewed-by: Vitaly Kuznetsov Message-Id: <20220526210817.3428868-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2aa17462a9ac..39ea9138224c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1373,7 +1373,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, if (mc->pos < mc->end) goto read_cached; - WARN_ON((mc->end + size) >= sizeof(mc->data)); + if (KVM_EMULATOR_BUG_ON((mc->end + size) >= sizeof(mc->data), ctxt)) + return X86EMUL_UNHANDLEABLE; rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size, &ctxt->exception); -- cgit v1.2.3 From 5bdae49fc2f689b5f896b54bd9230425d3643dab Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 15 Jun 2022 08:03:53 -0400 Subject: KVM: SEV: fix misplaced closing parenthesis This caused a warning on 32-bit systems, but undoubtedly would have acted funny on 64-bit as well. The fix was applied directly on merge in 5.19, see commit 24625f7d91fb ("Merge tag for-linus of git://git.kernel.org/pub/scm/virt/kvm/kvm"). Fixes: 3743c2f02517 ("KVM: x86: inhibit APICv/AVIC on changes to APIC ID or APIC base") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 5542d8959e11..d1bc5820ea46 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -908,9 +908,9 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) BIT(APICV_INHIBIT_REASON_PIT_REINJ) | BIT(APICV_INHIBIT_REASON_X2APIC) | BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | - BIT(APICV_INHIBIT_REASON_SEV | + BIT(APICV_INHIBIT_REASON_SEV) | BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | - BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED)); + BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); return supported & BIT(reason); } -- cgit v1.2.3 From e5380f6d7586ea3ef3a55d8cf19ceffacea31392 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 13 Jun 2022 21:42:37 +0000 Subject: KVM: SVM: Hide SEV migration lockdep goo behind CONFIG_PROVE_LOCKING Wrap the manipulation of @role and the manual mutex_{release,acquire}() invocations behind CONFIG_PROVE_LOCKING=y to squash a clang-15 warning. When building with -Wunused-but-set-parameter and CONFIG_DEBUG_LOCK_ALLOC=n, clang-15 sees there's no usage of @role in mutex_lock_killable_nested() and yells. PROVE_LOCKING selects DEBUG_LOCK_ALLOC, and the only reason KVM manipulates @role is to make PROVE_LOCKING happy. To avoid true ugliness, use "i" and "j" to detect the first pass in the loops; the "idx" field that's used by kvm_for_each_vcpu() is guaranteed to be '0' on the first pass as it's simply the first entry in the vCPUs XArray, which is fully KVM controlled. kvm_for_each_vcpu() passes '0' for xa_for_each_range()'s "start", and xa_for_each_range() will not enter the loop if there's no entry at '0'.
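The unwind structure that the diff below must preserve reduces to "lock everything, and on failure unlock exactly what was taken". A user-space sketch with pthreads and a hypothetical lock_all() helper, for illustration only; the kernel version additionally juggles lockdep roles, which has no user-space analogue:

  #include <pthread.h>
  #include <stddef.h>

  /* Take every lock in order; on failure, release only locks [0, i),
   * the same shape as sev_lock_vcpus_for_migration()'s "i"/"j" loops. */
  static int lock_all(pthread_mutex_t *locks, size_t n)
  {
          size_t i, j;

          for (i = 0; i < n; i++) {
                  if (pthread_mutex_lock(&locks[i]) != 0)
                          goto out_unlock;
          }
          return 0;

  out_unlock:
          for (j = 0; j < i; j++)
                  pthread_mutex_unlock(&locks[j]);
          return -1;
  }

  int main(void)
  {
          pthread_mutex_t locks[4] = {
                  PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
                  PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
          };
          return lock_all(locks, 4);
  }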
Fixes: 0c2c7c069285 ("KVM: SEV: Mark nested locking of vcpu->lock") Reported-by: kernel test robot Cc: Peter Gonda Signed-off-by: Sean Christopherson Message-Id: <20220613214237.2538266-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 51fd985cf21d..309bcdb2f929 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1606,38 +1606,35 @@ static int sev_lock_vcpus_for_migration(struct kvm *kvm, { struct kvm_vcpu *vcpu; unsigned long i, j; - bool first = true; kvm_for_each_vcpu(i, vcpu, kvm) { if (mutex_lock_killable_nested(&vcpu->mutex, role)) goto out_unlock; - if (first) { +#ifdef CONFIG_PROVE_LOCKING + if (!i) /* * Reset the role to one that avoids colliding with * the role used for the first vcpu mutex. */ role = SEV_NR_MIGRATION_ROLES; - first = false; - } else { + else mutex_release(&vcpu->mutex.dep_map, _THIS_IP_); - } +#endif } return 0; out_unlock: - first = true; kvm_for_each_vcpu(j, vcpu, kvm) { if (i == j) break; - if (first) - first = false; - else +#ifdef CONFIG_PROVE_LOCKING + if (j) mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_); - +#endif mutex_unlock(&vcpu->mutex); } -- cgit v1.2.3 From fc10020ac9ece7aacea87753beafc0fbd49e8c58 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 5 Jun 2022 14:34:13 +0800 Subject: KVM: X86/MMU: Remove unused PT32_DIR_BASE_ADDR_MASK from mmu.c It is unused. Signed-off-by: Lai Jiangshan Message-Id: <20220605063417.308311-3-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 17252f39bd7c..f168693695bd 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -125,8 +125,6 @@ module_param(dbg, bool, 0644); #define PT32_BASE_ADDR_MASK PAGE_MASK -#define PT32_DIR_BASE_ADDR_MASK \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) #define PT32_LVL_ADDR_MASK(level) \ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT32_LEVEL_BITS))) - 1)) -- cgit v1.2.3 From 024c3c3304ca36c23ee400b507fb59ae2e2db7fa Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 5 Jun 2022 14:34:16 +0800 Subject: KVM: X86/MMU: Remove useless mmu_topup_memory_caches() in kvm_mmu_pte_write() Since the commit c5e2184d1544("KVM: x86/mmu: Remove the defunct update_pte() paging hook"), kvm_mmu_pte_write() no longer uses the rmap cache. So remove mmu_topup_memory_caches() in it. Cc: Sean Christopherson Signed-off-by: Lai Jiangshan Message-Id: <20220605063417.308311-6-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f168693695bd..0a4438544c0d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5326,13 +5326,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); - /* - * No need to care whether allocation memory is successful - * or not since pte prefetch is skipped if it does not have - * enough objects in the cache. 
- */ - mmu_topup_memory_caches(vcpu, true); - write_lock(&vcpu->kvm->mmu_lock); gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); -- cgit v1.2.3 From 78c7d9001be704b7d13083333354c6ddf6e32fce Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 5 Jun 2022 14:34:17 +0800 Subject: KVM: X86/SVM: Use root_level in svm_load_mmu_pgd() Use root_level in svm_load_mmu_pgd() rather than looking up the root level in vcpu->arch.mmu->root_role.level. svm_load_mmu_pgd() has only one caller, kvm_mmu_load_pgd(), which always passes vcpu->arch.mmu->root_role.level as root_level. Signed-off-by: Lai Jiangshan Message-Id: <20220605063417.308311-7-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c6cca0ce127b..2ba0c62df8fb 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4049,7 +4049,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, hv_track_root_tdp(vcpu, root_hpa); cr3 = vcpu->arch.cr3; - } else if (vcpu->arch.mmu->root_role.level >= PT64_ROOT_4LEVEL) { + } else if (root_level >= PT64_ROOT_4LEVEL) { cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); } else { /* PCID in the guest should be impossible with a 32-bit MMU. */ -- cgit v1.2.3 From 007a369fba3c120d9a343bb2e8f04cf64c988183 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 13 Jun 2022 22:57:16 +0000 Subject: KVM: x86/mmu: Drop unused CMPXCHG macro from paging_tmpl.h Drop the CMPXCHG macro from paging_tmpl.h; it's no longer used now that KVM uses a common uaccess helper to do 8-byte CMPXCHG. Fixes: f122dfe44768 ("KVM: x86: Use __try_cmpxchg_user() to update guest PTE A/D bits") Signed-off-by: Sean Christopherson Message-Id: <20220613225723.2734132-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index fe35d8fd3276..f595c4b8657f 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -34,7 +34,6 @@ #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL - #define CMPXCHG "cmpxchgq" #else #define PT_MAX_FULL_LEVELS 2 #endif @@ -51,7 +50,6 @@ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true - #define CMPXCHG "cmpxchgl" #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 #define guest_walker guest_walkerEPT #define PT_GUEST_DIRTY_SHIFT 9 #define PT_GUEST_ACCESSED_SHIFT 8 #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled) - #ifdef CONFIG_X86_64 - #define CMPXCHG "cmpxchgq" - #endif #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL #else #error Invalid PTTYPE value @@ -1100,7 +1095,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef PT_MAX_FULL_LEVELS #undef gpte_to_gfn #undef gpte_to_gfn_lvl -#undef CMPXCHG #undef PT_GUEST_ACCESSED_MASK #undef PT_GUEST_DIRTY_MASK #undef PT_GUEST_DIRTY_SHIFT -- cgit v1.2.3 From d895f28ed6da96d7b922bd79977e2b732b103a99 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 10 Jun 2022 21:41:40 +0000 Subject: KVM: VMX: Skip filter updates for MSRs that KVM is already intercepting When handling userspace MSR filter updates, recompute interception for possible passthrough MSRs
if and only if KVM wants to disable interception. If KVM wants to intercept accesses, i.e. the associated bit is set in vmx->shadow_msr_intercept, then there's no need to set the intercept again as KVM will intercept the MSR regardless of userspace's wants. No functional change intended; the call to vmx_enable_intercept_for_msr() really is just a gigantic nop. Suggested-by: Aaron Lewis Signed-off-by: Sean Christopherson Message-Id: <20220610214140.612025-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5e14e4c40007..61962f3c4b28 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3981,17 +3981,21 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) u32 i; /* - * Set intercept permissions for all potentially passed through MSRs - * again. They will automatically get filtered through the MSR filter, - * so we are back in sync after this. + * Redo intercept permissions for MSRs that KVM is passing through to + * the guest. Disabling interception will check the new MSR filter and + * ensure that KVM enables interception if userspace wants to filter + * the MSR. MSRs that KVM is already intercepting don't need to be + * refreshed since KVM is going to intercept them regardless of what + * userspace wants. */ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { u32 msr = vmx_possible_passthrough_msrs[i]; - bool read = test_bit(i, vmx->shadow_msr_intercept.read); - bool write = test_bit(i, vmx->shadow_msr_intercept.write); - vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read); - vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write); + if (!test_bit(i, vmx->shadow_msr_intercept.read)) + vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); + + if (!test_bit(i, vmx->shadow_msr_intercept.write)) + vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); } pt_update_intercept_for_msr(vcpu); -- cgit v1.2.3 From aee98a6838d52d5cca14610d228893e9208f4ed1 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 18 May 2022 15:51:11 +0200 Subject: KVM: x86/mmu: Use try_cmpxchg64 in tdp_mmu_set_spte_atomic Use try_cmpxchg64 instead of cmpxchg64 (*ptr, old, new) != old in tdp_mmu_set_spte_atomic. cmpxchg returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, remove explicit assignment to iter->old_spte when cmpxchg fails; this is what try_cmpxchg does implicitly. Cc: Paolo Bonzini Cc: Sean Christopherson Signed-off-by: Uros Bizjak Reviewed-by: David Matlack Message-Id: <20220518135111.3535-1-ubizjak@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7b9265d67131..2cd060eb34a4 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -633,7 +633,6 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, u64 new_spte) { u64 *sptep = rcu_dereference(iter->sptep); - u64 old_spte; /* * The caller is responsible for ensuring the old SPTE is not a REMOVED * @@ -649,17 +648,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and * does not hold the mmu_lock.
*/ - old_spte = cmpxchg64(sptep, iter->old_spte, new_spte); - if (old_spte != iter->old_spte) { - /* - * The page table entry was modified by a different logical - * CPU. Refresh iter->old_spte with the current value so the - * caller operates on fresh data, e.g. if it retries - * tdp_mmu_set_spte_atomic(). - */ - iter->old_spte = old_spte; + if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) return -EBUSY; - } __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, new_spte, iter->level, true); -- cgit v1.2.3 From 0ac304de73b37b66793d6cc1ad3a03886aa79791 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 20 May 2022 16:37:37 +0200 Subject: KVM: VMX: Use try_cmpxchg64 in pi_try_set_control Use try_cmpxchg64 instead of cmpxchg64 (*ptr, old, new) != old in pi_try_set_control. cmpxchg returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg): b9: 88 44 24 60 mov %al,0x60(%rsp) bd: 48 89 c8 mov %rcx,%rax c0: c6 44 24 62 f2 movb $0xf2,0x62(%rsp) c5: 48 8b 74 24 60 mov 0x60(%rsp),%rsi ca: f0 49 0f b1 34 24 lock cmpxchg %rsi,(%r12) d0: 48 39 c1 cmp %rax,%rcx d3: 75 cf jne a4 patched: c1: 88 54 24 60 mov %dl,0x60(%rsp) c5: c6 44 24 62 f2 movb $0xf2,0x62(%rsp) ca: 48 8b 54 24 60 mov 0x60(%rsp),%rdx cf: f0 48 0f b1 13 lock cmpxchg %rdx,(%rbx) d4: 75 d5 jne ab Signed-off-by: Uros Bizjak Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Vitaly Kuznetsov Cc: Wanpeng Li Cc: Jim Mattson Cc: Joerg Roedel Reported-by: kernel test robot Message-Id: <20220520143737.62513-1-ubizjak@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 237a1f40f939..73f60aa480fe 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -42,7 +42,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) * update must be retried with a fresh snapshot an ON change causes * the cmpxchg to fail. */ - if (cmpxchg64(&pi_desc->control, old, new) != old) + if (!try_cmpxchg64(&pi_desc->control, &old, new)) return -EBUSY; return 0; -- cgit v1.2.3 From 2db2f46fdfc25691f3e90224e78bc5b2fc23dcd7 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 20 May 2022 16:46:35 +0200 Subject: KVM: x86/mmu: Use try_cmpxchg64 in fast_pf_fix_direct_spte Use try_cmpxchg64 instead of cmpxchg64 (*ptr, old, new) != old in fast_pf_fix_direct_spte. cmpxchg returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Signed-off-by: Uros Bizjak Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Vitaly Kuznetsov Cc: Wanpeng Li Cc: Jim Mattson Cc: Joerg Roedel Message-Id: <20220520144635.63134-1-ubizjak@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0a4438544c0d..34785bc3077d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3093,7 +3093,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * * Compare with set_spte where instead shadow_dirty_mask is set. 
*/ - if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) + if (!try_cmpxchg64(sptep, &old_spte, new_spte)) return false; if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) -- cgit v1.2.3 From fa578398a0ba2c079fa1170da21fa5baae0cedb2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 21:58:27 +0000 Subject: KVM: nVMX: Snapshot pre-VM-Enter BNDCFGS for !nested_run_pending case If a nested run isn't pending, snapshot vmcs01.GUEST_BNDCFGS irrespective of whether or not VM_ENTRY_LOAD_BNDCFGS is set in vmcs12. When restoring nested state, e.g. after migration, without a nested run pending, prepare_vmcs02() will propagate nested.vmcs01_guest_bndcfgs to vmcs02, i.e. will load garbage/zeros into vmcs02.GUEST_BNDCFGS. If userspace restores nested state before MSRs, then loading garbage is a non-issue as loading BNDCFGS will also update vmcs02. But if userspace restores MSRs first, then KVM is responsible for propagating L2's value, which is actually thrown into vmcs01, into vmcs02. Restoring L2 MSRs into vmcs01, i.e. loading all MSRs before nested state is all kinds of bizarre and ideally would not be supported. Sadly, some VMMs do exactly that and rely on KVM to make things work. Note, there's still a lurking SMM bug, as propagating vmcs01.GUEST_BNDCFGS to vmcs02 across RSM may corrupt L2's BNDCFGS. But KVM's entire VMX+SMM emulation is flawed as SMI+RSM should not touch _any_ VMCS when using the "default treatment of SMIs", i.e. when not using an SMI Transfer Monitor. Link: https://lore.kernel.org/all/Yobt1XwOfb5M6Dfa@google.com Fixes: 62cf9bd8118c ("KVM: nVMX: Fix emulation of VM_ENTRY_LOAD_BNDCFGS") Cc: stable@vger.kernel.org Cc: Lei Wang Signed-off-by: Sean Christopherson Message-Id: <20220614215831.3762138-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7d8cd0ebcc75..66c25bb56938 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3382,7 +3382,8 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); if (kvm_mpx_supported() && - !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); /* -- cgit v1.2.3 From 764643a6be07445308e492a528197044c801b3ba Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 21:58:28 +0000 Subject: KVM: nVMX: Snapshot pre-VM-Enter DEBUGCTL for !nested_run_pending case If a nested run isn't pending, snapshot vmcs01.GUEST_IA32_DEBUGCTL irrespective of whether or not VM_ENTRY_LOAD_DEBUG_CONTROLS is set in vmcs12. When restoring nested state, e.g. after migration, without a nested run pending, prepare_vmcs02() will propagate nested.vmcs01_debugctl to vmcs02, i.e. will load garbage/zeros into vmcs02.GUEST_IA32_DEBUGCTL. If userspace restores nested state before MSRs, then loading garbage is a non-issue as loading DEBUGCTL will also update vmcs02. But if userspace restores MSRs first, then KVM is responsible for propagating L2's value, which is actually thrown into vmcs01, into vmcs02. Restoring L2 MSRs into vmcs01, i.e.
loading all MSRs before nested state is all kinds of bizarre and ideally would not be supported. Sadly, some VMMs do exactly that and rely on KVM to make things work. Note, there's still a lurking SMM bug, as propagating vmcs01's DEBUGCTL to vmcs02 across RSM may corrupt L2's DEBUGCTL. But KVM's entire VMX+SMM emulation is flawed as SMI+RSM should not touch _any_ VMCS when using the "default treatment of SMIs", i.e. when not using an SMI Transfer Monitor. Link: https://lore.kernel.org/all/Yobt1XwOfb5M6Dfa@google.com Fixes: 8fcc4b5923af ("kvm: nVMX: Introduce KVM_CAP_NESTED_STATE") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20220614215831.3762138-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 66c25bb56938..4a53e0c73445 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3379,7 +3379,8 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); - if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) + if (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); -- cgit v1.2.3 From 5d76b1f8c79309c0b60c8db5f16774f1691945a7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 21:58:29 +0000 Subject: KVM: nVMX: Rename nested.vmcs01_* fields to nested.pre_vmenter_* Rename the fields in struct nested_vmx used to snapshot pre-VM-Enter values to reflect that they can hold L2's values when restoring nested state, e.g. if userspace restores MSRs before nested state. As crazy as it seems, restoring MSRs before nested state actually works (because KVM goes out of its way to make it work), even though the initial MSR writes will hit vmcs01 despite holding L2 values. Add a related comment to vmx_enter_smm() to call out that using the common VM-Exit and VM-Enter helpers to emulate SMI and RSM is wrong and broken. The few MSRs that have snapshots _could_ be fixed by taking a snapshot prior to the forced VM-Exit instead of at forced VM-Enter, but that's just the tip of the iceberg as the rather long list of MSRs that aren't snapshotted (hello, VM-Exit MSR load list) can't be handled this way.
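Distilled, the snapshot rule that the two preceding patches introduce and this rename documents is: capture the current value unless a nested run is pending and vmcs12 will load its own value on VM-Enter. A hedged stand-alone sketch with hypothetical types; the control-bit position is illustrative only (the architectural "load IA32_BNDCFGS" bit is defined by the SDM):

  #include <stdbool.h>
  #include <stdint.h>

  #define ENTRY_LOAD_BNDCFGS (UINT32_C(1) << 16) /* illustrative position */

  struct nested_state {
          bool nested_run_pending;
          uint32_t vm_entry_controls;   /* from vmcs12 */
          uint64_t pre_vmenter_bndcfgs; /* the snapshot */
  };

  /* Snapshot unless a pending nested run will load L2's own BNDCFGS;
   * covers both emulated VMLAUNCH/VMRESUME and KVM_SET_NESTED_STATE. */
  static void maybe_snapshot_bndcfgs(struct nested_state *nx, uint64_t cur)
  {
          if (!nx->nested_run_pending ||
              !(nx->vm_entry_controls & ENTRY_LOAD_BNDCFGS))
                  nx->pre_vmenter_bndcfgs = cur;
  }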
Signed-off-by: Sean Christopherson Message-Id: <20220614215831.3762138-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 ++++---- arch/x86/kvm/vmx/vmx.c | 7 +++++++ arch/x86/kvm/vmx/vmx.h | 15 ++++++++++++--- 3 files changed, 23 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 4a53e0c73445..38015f4ecc54 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2520,11 +2520,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); } else { kvm_set_dr(vcpu, 7, vcpu->arch.dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); } if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) - vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); + vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the @@ -3381,11 +3381,11 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, if (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) - vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) - vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); /* * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 61962f3c4b28..b4d3820e9800 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7847,6 +7847,13 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * TODO: Implement custom flows for forcing the vCPU out/in of L2 on + * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong + * SMI and RSM only modify state that is saved and restored via SMRAM. + * E.g. most MSRs are left untouched, but many are modified by VM-Exit + * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. + */ vmx->nested.smm.guest_mode = is_guest_mode(vcpu); if (vmx->nested.smm.guest_mode) nested_vmx_vmexit(vcpu, -1, 0, 0); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 71bcb486e73f..a84c91ee2a48 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -219,9 +219,18 @@ struct nested_vmx { bool has_preemption_timer_deadline; bool preemption_timer_expired; - /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ - u64 vmcs01_debugctl; - u64 vmcs01_guest_bndcfgs; + /* + * Used to snapshot MSRs that are conditionally loaded on VM-Enter in + * order to propagate the guest's pre-VM-Enter value into vmcs02. For + * emulation of VMLAUNCH/VMRESUME, the snapshot will be of L1's value. + * For KVM_SET_NESTED_STATE, the snapshot is of L2's value, _if_ + * userspace restores MSRs before nested state. If userspace restores + * MSRs after nested state, the snapshot holds garbage, but KVM can't + * detect that, and the garbage value in vmcs02 will be overwritten by + * MSR restoration in any case. 
+ */ + u64 pre_vmenter_debugctl; + u64 pre_vmenter_bndcfgs; /* to migrate it to L1 if L2 writes to L1's CR8 directly */ int l1_tpr_threshold; -- cgit v1.2.3 From 308a4fffeb361af90f17837c1bcace3139d1f677 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 21:58:30 +0000 Subject: KVM: nVMX: Save BNDCFGS to vmcs12 iff relevant controls are exposed to L1 Save BNDCFGS to vmcs12 (from vmcs02) if and only if at least one of the load-on-entry or clear-on-exit fields for BNDCFGS is enumerated as an allowed-1 bit in vmcs12. Skipping the field avoids an unnecessary VMREAD when MPX is supported but not exposed to L1. Per Intel's SDM: If the processor supports either the 1-setting of the "load IA32_BNDCFGS" VM-entry control or that of the "clear IA32_BNDCFGS" VM-exit control, the contents of the IA32_BNDCFGS MSR are saved into the corresponding field. Signed-off-by: Sean Christopherson Message-Id: <20220614215831.3762138-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 38015f4ecc54..496981b86f94 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4104,7 +4104,8 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); - if (kvm_mpx_supported()) + if ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || + (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)) vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; -- cgit v1.2.3 From 913d6c9b8fe48f0836c00e359fb1ea39089d25e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 21:58:31 +0000 Subject: KVM: nVMX: Update vmcs12 on BNDCFGS write, not at vmcs02=>vmcs12 sync Update vmcs12->guest_bndcfgs on intercepted writes to BNDCFGS from L2 instead of waiting until vmcs02 is synchronized to vmcs12. KVM always intercepts BNDCFGS accesses, so the only way the value in vmcs02 can change is via KVM's explicit VMWRITE during emulation.
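The guard added below boils down to a single predicate: BNDCFGS is relevant to L1 iff at least one of the two controls is exposed as allowed-1. A stand-alone sketch; the bit positions are illustrative stand-ins for the kernel's VM_ENTRY_LOAD_BNDCFGS and VM_EXIT_CLEAR_BNDCFGS constants:

  #include <stdbool.h>
  #include <stdint.h>

  /* Illustrative positions for the "load IA32_BNDCFGS" VM-entry control
   * and the "clear IA32_BNDCFGS" VM-exit control. */
  #define ENTRY_LOAD_BNDCFGS (UINT64_C(1) << 16)
  #define EXIT_CLEAR_BNDCFGS (UINT64_C(1) << 23)

  /* Mirrors the condition added to sync_vmcs02_to_vmcs12_rare(). */
  static bool bndcfgs_relevant_to_l1(uint64_t entry_ctls_high,
                                     uint64_t exit_ctls_high)
  {
          return (entry_ctls_high & ENTRY_LOAD_BNDCFGS) ||
                 (exit_ctls_high & EXIT_CLEAR_BNDCFGS);
  }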
Signed-off-by: Sean Christopherson Message-Id: <20220614215831.3762138-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 --- arch/x86/kvm/vmx/vmx.c | 6 ++++++ 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 496981b86f94..aad938e1e51d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4104,9 +4104,6 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); - if ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || - (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)) - vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b4d3820e9800..e4c16ee879d5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2044,6 +2044,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (is_noncanonical_address(data & PAGE_MASK, vcpu) || (data & MSR_IA32_BNDCFGS_RSVD)) return 1; + + if (is_guest_mode(vcpu) && + ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || + (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) + get_vmcs12(vcpu)->guest_bndcfgs = data; + vmcs_write64(GUEST_BNDCFGS, data); break; case MSR_IA32_UMWAIT_CONTROL: -- cgit v1.2.3 From ec1d7e6ab9ff15d9ff7b3628a4320907544675e1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 23:05:44 +0000 Subject: KVM: SVM: Drop unused AVIC / kvm_x86_ops declarations Drop a handful of unused AVIC function declarations whose implementations were removed during the conversion to optional static calls. No functional change intended. Fixes: abb6d479e226 ("KVM: x86: make several APIC virtualization callbacks optional") Signed-off-by: Sean Christopherson Message-Id: <20220614230548.3852141-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 128993feb4c6..d51de3c9264a 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -617,12 +617,8 @@ int avic_init_vcpu(struct vcpu_svm *svm); void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void avic_vcpu_put(struct kvm_vcpu *vcpu); void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); -void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason); -void avic_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); -void avic_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); -bool avic_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void avic_vcpu_blocking(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From d39850f57d2102c6b46feb21237bc23bc42de4f7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 23:05:45 +0000 Subject: KVM: x86: Drop @vcpu parameter from kvm_x86_ops.hwapic_isr_update() Drop the unused @vcpu parameter from hwapic_isr_update(). AMD/AVIC is unlikely to implement the helper, and VMX/APICv doesn't need the vCPU as it operates on the current VMCS. 
The result is somewhat odd, but allows for a decent amount of (future) cleanup in the APIC code. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220614230548.3852141-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/lapic.c | 8 ++++---- arch/x86/kvm/vmx/vmx.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7e98b2876380..16acc54d49a7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1517,7 +1517,7 @@ struct kvm_x86_ops { bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); - void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); + void (*hwapic_isr_update)(int isr); bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a413a1d8df4c..cc0da5671eb9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -556,7 +556,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * just set SVI. */ if (unlikely(vcpu->arch.apicv_active)) - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, vec); + static_call_cond(kvm_x86_hwapic_isr_update)(vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -604,7 +604,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * and must be left alone. */ if (unlikely(vcpu->arch.apicv_active)) - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); + static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); else { --apic->isr_count; BUG_ON(apic->isr_count < 0); @@ -2457,7 +2457,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) if (vcpu->arch.apicv_active) { static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1); - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1); + static_call_cond(kvm_x86_hwapic_isr_update)(-1); } vcpu->arch.apic_arb_prio = 0; @@ -2737,7 +2737,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) if (vcpu->arch.apicv_active) { static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); + static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e4c16ee879d5..380c8e7c1fb8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6566,7 +6566,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) put_page(page); } -static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +static void vmx_hwapic_isr_update(int max_isr) { u16 status; u8 old; -- cgit v1.2.3 From ae801e1303e939ad5ebd9f390bdcc57275ada33b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 23:05:46 +0000 Subject: KVM: x86: Check for in-kernel xAPIC when querying APICv for directed yield Use kvm_vcpu_apicv_active() to check if APICv is active when seeing if a 
vCPU is a candidate for directed yield due to a pending APICv interrupt. This will allow moving apicv_active into kvm_lapic without introducing a potential NULL pointer deref (kvm_vcpu_apicv_active() effectively adds a pre-check on the vCPU having an in-kernel APIC). No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220614230548.3852141-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a0baf49bb00d..db35d40bf996 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12395,7 +12395,8 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) { - if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) + if (kvm_vcpu_apicv_active(vcpu) && + static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) return true; return false; -- cgit v1.2.3 From ce0a58f4756c14d7646cfdf279dbaada9d7712a0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 23:05:47 +0000 Subject: KVM: x86: Move "apicv_active" into "struct kvm_lapic" Move the per-vCPU apicv_active flag into KVM's local APIC instance. APICv is fully dependent on an in-kernel local APIC, but that's not at all clear when reading the current code due to the flag being stored in the generic kvm_vcpu_arch struct. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220614230548.3852141-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/lapic.c | 30 ++++++++++-------------------- arch/x86/kvm/lapic.h | 3 ++- arch/x86/kvm/svm/svm.c | 5 +++-- arch/x86/kvm/vmx/vmx.c | 3 ++- arch/x86/kvm/x86.c | 11 ++++++----- 6 files changed, 23 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 16acc54d49a7..1038ccb7056a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -663,7 +663,6 @@ struct kvm_vcpu_arch { u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ - bool apicv_active; bool load_eoi_exitmap_pending; DECLARE_BITMAP(ioapic_handled_vectors, 256); unsigned long apic_attention; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cc0da5671eb9..43c42a580295 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -519,14 +519,11 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) { - struct kvm_vcpu *vcpu; - - vcpu = apic->vcpu; - - if (unlikely(vcpu->arch.apicv_active)) { + if (unlikely(apic->apicv_active)) { /* need to update RVI */ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); - static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); + static_call_cond(kvm_x86_hwapic_irr_update)(apic->vcpu, + apic_find_highest_irr(apic)); } else { apic->irr_pending = false; kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); @@ -543,19 +540,15 @@ EXPORT_SYMBOL_GPL(kvm_apic_clear_irr); static inline void apic_set_isr(int vec, struct kvm_lapic *apic) { - struct kvm_vcpu *vcpu; - if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) return; - vcpu = apic->vcpu; - /* * With APIC virtualization enabled, all caching is disabled * because the processor can modify ISR under the hood. Instead * just set SVI.
*/ - if (unlikely(vcpu->arch.apicv_active)) + if (unlikely(apic->apicv_active)) static_call_cond(kvm_x86_hwapic_isr_update)(vec); else { ++apic->isr_count; @@ -590,12 +583,9 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) { - struct kvm_vcpu *vcpu; if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) return; - vcpu = apic->vcpu; - /* * We do get here for APIC virtualization enabled if the guest * uses the Hyper-V APIC enlightenment. In this case we may need @@ -603,7 +593,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * on the other hand isr_count and highest_isr_cache are unused * and must be left alone. */ - if (unlikely(vcpu->arch.apicv_active)) + if (unlikely(apic->apicv_active)) static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); else { --apic->isr_count; @@ -1584,7 +1574,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) int vec = reg & APIC_VECTOR_MASK; void *bitmap = apic->regs + APIC_ISR; - if (vcpu->arch.apicv_active) + if (apic->apicv_active) bitmap = apic->regs + APIC_IRR; if (apic_test_vector(vec, bitmap)) @@ -1701,7 +1691,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use) ktimer->expired_tscdeadline = ktimer->tscdeadline; - if (!from_timer_fn && vcpu->arch.apicv_active) { + if (!from_timer_fn && apic->apicv_active) { WARN_ON(kvm_get_running_vcpu() != vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; @@ -2379,7 +2369,7 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (vcpu->arch.apicv_active) { + if (apic->apicv_active) { /* irr_pending is always true when apicv is activated. 
*/ apic->irr_pending = true; apic->isr_count = 1; @@ -2454,7 +2444,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); - if (vcpu->arch.apicv_active) { + if (apic->apicv_active) { static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1); static_call_cond(kvm_x86_hwapic_isr_update)(-1); @@ -2734,7 +2724,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_lapic_set_reg(apic, APIC_TMCCT, 0); kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; - if (vcpu->arch.apicv_active) { + if (apic->apicv_active) { static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 65bb2a8cf145..e09ad97f3250 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -48,6 +48,7 @@ struct kvm_lapic { struct kvm_timer lapic_timer; u32 divide_count; struct kvm_vcpu *vcpu; + bool apicv_active; bool sw_enabled; bool irr_pending; bool lvt0_in_nmi_mode; @@ -204,7 +205,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu) { - return vcpu->arch.apic && vcpu->arch.apicv_active; + return vcpu->arch.apic && vcpu->arch.apic->apicv_active; } static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2ba0c62df8fb..136298cfb3fb 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3465,12 +3465,13 @@ void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, int trig_mode, int vector) { /* - * vcpu->arch.apicv_active must be read after vcpu->mode. + * apic->apicv_active must be read after vcpu->mode. * Pairs with smp_store_release in vcpu_enter_guest. */ bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE); - if (!READ_ONCE(vcpu->arch.apicv_active)) { + /* Note, this is called iff the local APIC is in-kernel. */ + if (!READ_ONCE(vcpu->arch.apic->apicv_active)) { /* Process the interrupt via inject_pending_event */ kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 380c8e7c1fb8..2609bcc8c130 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4099,7 +4099,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (!r) return 0; - if (!vcpu->arch.apicv_active) + /* Note, this is called iff the local APIC is in-kernel. 
*/ + if (!vcpu->arch.apic->apicv_active) return -1; if (pi_test_and_set_pir(vector, &vmx->pi_desc)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index db35d40bf996..00e23dc518e0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9460,7 +9460,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; - if (vcpu->arch.apicv_active) + if (vcpu->arch.apic->apicv_active) return; if (!vcpu->arch.apic->vapic_addr) @@ -9913,6 +9913,7 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm) void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) { + struct kvm_lapic *apic = vcpu->arch.apic; bool activate; if (!lapic_in_kernel(vcpu)) @@ -9923,10 +9924,10 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) activate = kvm_vcpu_apicv_activated(vcpu); - if (vcpu->arch.apicv_active == activate) + if (apic->apicv_active == activate) goto out; - vcpu->arch.apicv_active = activate; + apic->apicv_active = activate; kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); @@ -9936,7 +9937,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) * still active when the interrupt got accepted. Make sure * inject_pending_event() is called to check for that. */ - if (!vcpu->arch.apicv_active) + if (!apic->apicv_active) kvm_make_request(KVM_REQ_EVENT, vcpu); out: @@ -11379,7 +11380,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) * will ensure the vCPU gets the correct state before VM-Entry. */ if (enable_apicv) { - vcpu->arch.apicv_active = true; + vcpu->arch.apic->apicv_active = true; kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } } else -- cgit v1.2.3 From b8e1b9626746209c980ce3fbf9ec3fc86910873d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 23:05:48 +0000 Subject: KVM: x86: Use lapic_in_kernel() to query in-kernel APIC in APICv helper Use lapic_in_kernel() in kvm_vcpu_apicv_active() to take advantage of the kvm_has_noapic_vcpu static branch. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220614230548.3852141-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index e09ad97f3250..6207b64b1281 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -205,7 +205,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu) { - return vcpu->arch.apic && vcpu->arch.apic->apicv_active; + return lapic_in_kernel(vcpu) && vcpu->arch.apic->apicv_active; } static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 1ae20e0b975caf8ff51511a89cce5e28ec3e4d70 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 23:33:22 +0000 Subject: KVM: VMX: Refactor 32-bit PSE PT creation to avoid using MMU macro Compute the number of PTEs to be filled for the 32-bit PSE page tables using the page size and the size of each entry. While using the MMU's PT32_ENT_PER_PAGE macro is arguably better in isolation, removing VMX's usage will allow a future namespacing cleanup to move the guest page table macros into paging_tmpl.h, out of the reach of code that isn't directly related to shadow paging. No functional change intended. 
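The replacement expression in the diff below is pure arithmetic: with 4KiB pages and 4-byte PSE entries, PAGE_SIZE / sizeof(tmp) is 4096 / 4 = 1024 entries, exactly what PT32_ENT_PER_PAGE encoded, and entry i maps the 4MiB region starting at i << 22. A quick stand-alone self-check, assuming 4KiB pages:

  #include <assert.h>
  #include <stdint.h>

  int main(void)
  {
          const unsigned long page_size = 4096; /* assumed PAGE_SIZE */
          uint32_t tmp;                         /* a 32-bit PSE entry */

          /* 1024 entries per page table, i.e. PT32_ENT_PER_PAGE. */
          assert(page_size / sizeof(tmp) == 1024);
          /* Each PSE entry maps 4MiB, so entry 1 covers [4MiB, 8MiB). */
          assert((1UL << 22) == 4UL * 1024 * 1024);
          return 0;
  }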
Signed-off-by: Sean Christopherson Message-Id: <20220614233328.3896033-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2609bcc8c130..4002f6cc179d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3710,7 +3710,7 @@ static int init_rmode_identity_map(struct kvm *kvm) } /* Set up identity-mapping pagetable for EPT in real mode */ - for (i = 0; i < PT32_ENT_PER_PAGE; i++) { + for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { -- cgit v1.2.3 From b3fcdb04a98035f55f7ba9e3c87d3c4eb2f95b4b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 23:33:23 +0000 Subject: KVM: x86/mmu: Bury 32-bit PSE paging helpers in paging_tmpl.h Move a handful of one-off macros and helpers for 32-bit PSE paging into paging_tmpl.h and hide them behind "PTTYPE == 32". Under no circumstance should anything but 32-bit shadow paging care about PSE paging. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220614233328.3896033-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 5 ----- arch/x86/kvm/mmu/mmu.c | 7 ------- arch/x86/kvm/mmu/paging_tmpl.h | 18 +++++++++++++++++- 3 files changed, 17 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index f8192864b496..d1021e34ac15 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -34,11 +34,6 @@ #define PT_DIR_PAT_SHIFT 12 #define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) -#define PT32_DIR_PSE36_SIZE 4 -#define PT32_DIR_PSE36_SHIFT 13 -#define PT32_DIR_PSE36_MASK \ - (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) - #define PT64_ROOT_5LEVEL 5 #define PT64_ROOT_4LEVEL 4 #define PT32_ROOT_LEVEL 2 diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 34785bc3077d..1a49b50d152e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -324,13 +324,6 @@ static int is_cpuid_PSE36(void) return 1; } -static gfn_t pse36_gfn_delta(u32 gpte) -{ - int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; - - return (gpte & PT32_DIR_PSE36_MASK) << shift; -} - #ifdef CONFIG_X86_64 static void __set_spte(u64 *sptep, u64 spte) { diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index f595c4b8657f..55fd35b1b227 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -50,6 +50,11 @@ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true + + #define PT32_DIR_PSE36_SIZE 4 + #define PT32_DIR_PSE36_SHIFT 13 + #define PT32_DIR_PSE36_MASK \ + (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 #define guest_walker guest_walkerEPT @@ -92,6 +97,15 @@ struct guest_walker { struct x86_exception fault; }; +#if PTTYPE == 32 +static inline gfn_t pse36_gfn_delta(u32 gpte) +{ + int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; + + return (gpte & PT32_DIR_PSE36_MASK) << shift; +} +#endif + static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) { return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; @@ -416,8 +430,10 @@ retry_walk: gfn = gpte_to_gfn_lvl(pte, walker->level); gfn += (addr & 
PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; - if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) +#if PTTYPE == 32 + if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); +#endif real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault); if (real_gpa == UNMAPPED_GVA) -- cgit v1.2.3 From 42c88ff893f06b6ab4eaeb2c37e513edf8c1943b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 23:33:24 +0000 Subject: KVM: x86/mmu: Dedup macros for computing various page table masks Provide common helper macros to generate various masks, shifts, etc... for 32-bit vs. 64-bit page tables. Only the inputs differ, the actual calculations are identical. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220614233328.3896033-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 4 ++-- arch/x86/kvm/mmu/mmu.c | 14 +++++--------- arch/x86/kvm/mmu/mmu_internal.h | 14 ++++++++++++++ arch/x86/kvm/mmu/paging.h | 9 +++++---- arch/x86/kvm/mmu/spte.h | 7 +++---- 5 files changed, 29 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index d1021e34ac15..6efe6bd7fb6e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -7,9 +7,9 @@ #include "cpuid.h" #define PT64_PT_BITS 9 -#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) +#define PT64_ENT_PER_PAGE __PT_ENT_PER_PAGE(PT64_PT_BITS) #define PT32_PT_BITS 10 -#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) +#define PT32_ENT_PER_PAGE __PT_ENT_PER_PAGE(PT32_PT_BITS) #define PT_WRITABLE_SHIFT 1 #define PT_USER_SHIFT 2 diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1a49b50d152e..2c5470953579 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -113,21 +113,17 @@ module_param(dbg, bool, 0644); #define PT32_LEVEL_BITS 10 -#define PT32_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) +#define PT32_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, PT32_LEVEL_BITS) #define PT32_LVL_OFFSET_MASK(level) \ - (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT32_LEVEL_BITS))) - 1)) - -#define PT32_INDEX(address, level)\ - (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) + __PT_LVL_OFFSET_MASK(PT32_BASE_ADDR_MASK, level, PT32_LEVEL_BITS) +#define PT32_INDEX(address, level) __PT_INDEX(address, level, PT32_LEVEL_BITS) #define PT32_BASE_ADDR_MASK PAGE_MASK + #define PT32_LVL_ADDR_MASK(level) \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT32_LEVEL_BITS))) - 1)) + __PT_LVL_ADDR_MASK(PT32_BASE_ADDR_MASK, level, PT32_LEVEL_BITS) #include diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index bd2a26897b97..5e1e3c8f8aaa 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -20,6 +20,20 @@ extern bool dbg; #define MMU_WARN_ON(x) do { } while (0) #endif +/* Page table builder macros common to shadow (host) PTEs and guest PTEs. 
*/ +#define __PT_LEVEL_SHIFT(level, bits_per_level) \ + (PAGE_SHIFT + ((level) - 1) * (bits_per_level)) +#define __PT_INDEX(address, level, bits_per_level) \ + (((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1)) + +#define __PT_LVL_ADDR_MASK(base_addr_mask, level, bits_per_level) \ + ((base_addr_mask) & ~((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1)) + +#define __PT_LVL_OFFSET_MASK(base_addr_mask, level, bits_per_level) \ + ((base_addr_mask) & ((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1)) + +#define __PT_ENT_PER_PAGE(bits_per_level) (1 << (bits_per_level)) + /* * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT * bit, and thus are guaranteed to be non-zero when valid. And, when a guest diff --git a/arch/x86/kvm/mmu/paging.h b/arch/x86/kvm/mmu/paging.h index de8ab323bb70..23f3f64b8092 100644 --- a/arch/x86/kvm/mmu/paging.h +++ b/arch/x86/kvm/mmu/paging.h @@ -4,11 +4,12 @@ #define __KVM_X86_PAGING_H #define GUEST_PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) + #define PT64_LVL_ADDR_MASK(level) \ - (GUEST_PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) + __PT_LVL_ADDR_MASK(GUEST_PT64_BASE_ADDR_MASK, level, PT64_LEVEL_BITS) + #define PT64_LVL_OFFSET_MASK(level) \ - (GUEST_PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) + __PT_LVL_OFFSET_MASK(GUEST_PT64_BASE_ADDR_MASK, level, PT64_LEVEL_BITS) + #endif /* __KVM_X86_PAGING_H */ diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 0127bb6e3c7d..d5a8183b7232 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -55,11 +55,10 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #define PT64_LEVEL_BITS 9 -#define PT64_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) +#define PT64_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, PT64_LEVEL_BITS) + +#define PT64_INDEX(address, level) __PT_INDEX(address, level, PT64_LEVEL_BITS) -#define PT64_INDEX(address, level)\ - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) /* -- cgit v1.2.3 From 2ca3129e8045b18eb15431b485d40c028fe8fb00 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 23:33:25 +0000 Subject: KVM: x86/mmu: Use separate namespaces for guest PTEs and shadow PTEs Separate the macros for KVM's shadow PTEs (SPTE) from guest 64-bit PTEs (PT64). SPTE and PT64 are _mostly_ the same, but the few differences are quite critical, e.g. *_BASE_ADDR_MASK must differentiate between host and guest physical address spaces, and SPTE_PERM_MASK (was PT64_PERM_MASK) is very much specific to SPTEs. Opportunistically (and temporarily) move most guest macros into paging.h to clearly associate them with shadow paging, and to ensure that they're not used as of this commit. A future patch will eliminate them entirely. Sadly, PT32_LEVEL_BITS is left behind in mmu_internal.h because it's needed for the quadrant calculation in kvm_mmu_get_page(). The quadrant calculation is hot enough (when using shadow paging with 32-bit guests) that adding a per-context helper is undesirable, and burying the computation in paging_tmpl.h with a forward declaration isn't exactly an improvement. 
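As a sanity check on the __PT_* builder macros added earlier in this series, plugging in 10 bits per level (32-bit non-PAE paging) and 9 bits per level (64-bit/PAE) reproduces the familiar constants. A stand-alone sketch, assuming PAGE_SHIFT is 12:

  #include <assert.h>

  #define PAGE_SHIFT 12 /* assumed, as on x86 */
  #define __PT_LEVEL_SHIFT(level, bits_per_level) \
          (PAGE_SHIFT + ((level) - 1) * (bits_per_level))
  #define __PT_ENT_PER_PAGE(bits_per_level) (1 << (bits_per_level))

  int main(void)
  {
          /* 32-bit paging: a level-2 (PDE) entry maps 1 << 22 = 4MiB. */
          assert(__PT_LEVEL_SHIFT(2, 10) == 22);
          assert(__PT_ENT_PER_PAGE(10) == 1024);
          /* 64-bit/PAE paging: a level-2 entry maps 1 << 21 = 2MiB. */
          assert(__PT_LEVEL_SHIFT(2, 9) == 21);
          assert(__PT_ENT_PER_PAGE(9) == 512);
          return 0;
  }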
Signed-off-by: Sean Christopherson Message-Id: <20220614233328.3896033-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 5 ---- arch/x86/kvm/mmu/mmu.c | 60 ++++++++++++++++++++++-------------------- arch/x86/kvm/mmu/paging.h | 17 ++++++++++++ arch/x86/kvm/mmu/paging_tmpl.h | 6 ++--- arch/x86/kvm/mmu/spte.c | 2 +- arch/x86/kvm/mmu/spte.h | 27 +++++++++---------- arch/x86/kvm/mmu/tdp_iter.c | 6 ++--- arch/x86/kvm/mmu/tdp_mmu.c | 6 ++--- 8 files changed, 70 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 6efe6bd7fb6e..a99acec925eb 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -6,11 +6,6 @@ #include "kvm_cache_regs.h" #include "cpuid.h" -#define PT64_PT_BITS 9 -#define PT64_ENT_PER_PAGE __PT_ENT_PER_PAGE(PT64_PT_BITS) -#define PT32_PT_BITS 10 -#define PT32_ENT_PER_PAGE __PT_ENT_PER_PAGE(PT32_PT_BITS) - #define PT_WRITABLE_SHIFT 1 #define PT_USER_SHIFT 2 diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2c5470953579..75f9e344812e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -111,20 +111,6 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 -#define PT32_LEVEL_BITS 10 - -#define PT32_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, PT32_LEVEL_BITS) - -#define PT32_LVL_OFFSET_MASK(level) \ - __PT_LVL_OFFSET_MASK(PT32_BASE_ADDR_MASK, level, PT32_LEVEL_BITS) - -#define PT32_INDEX(address, level) __PT_INDEX(address, level, PT32_LEVEL_BITS) - -#define PT32_BASE_ADDR_MASK PAGE_MASK - -#define PT32_LVL_ADDR_MASK(level) \ - __PT_LVL_ADDR_MASK(PT32_BASE_ADDR_MASK, level, PT32_LEVEL_BITS) - #include /* make pte_list_desc fit well in cache lines */ @@ -704,7 +690,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) if (!sp->role.direct) return sp->gfns[index]; - return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); + return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS)); } static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) @@ -1776,7 +1762,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, continue; } - child = to_shadow_page(ent & PT64_BASE_ADDR_MASK); + child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) @@ -2027,8 +2013,24 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, role.direct = direct; role.access = access; if (role.has_4_byte_gpte) { - quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); - quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; + /* + * If the guest has 4-byte PTEs then that means it's using 32-bit, + * 2-level, non-PAE paging. KVM shadows such guests with PAE paging + * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must + * shadow each guest page table with multiple shadow page tables, which + * requires extra bookkeeping in the role. + * + * Specifically, to shadow the guest's page directory (which covers a + * 4GiB address space), KVM uses 4 PAE page directories, each mapping + * 1GiB of the address space. @role.quadrant encodes which quarter of + * the address space each maps. + * + * To shadow the guest's page tables (which each map a 4MiB region), KVM + * uses 2 PAE page tables, each mapping a 2MiB region. For these, + * @role.quadrant encodes which half of the region they map. 
+ */ + quadrant = gaddr >> (PAGE_SHIFT + (SPTE_LEVEL_BITS * level)); + quadrant &= (1 << level) - 1; role.quadrant = quadrant; } if (level <= vcpu->arch.mmu->cpu_role.base.level) @@ -2132,7 +2134,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato iterator->shadow_addr = vcpu->arch.mmu->pae_root[(addr >> 30) & 3]; - iterator->shadow_addr &= PT64_BASE_ADDR_MASK; + iterator->shadow_addr &= SPTE_BASE_ADDR_MASK; --iterator->level; if (!iterator->shadow_addr) iterator->level = 0; @@ -2151,7 +2153,7 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) if (iterator->level < PG_LEVEL_4K) return false; - iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); + iterator->index = SPTE_INDEX(iterator->addr, iterator->level); iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; return true; } @@ -2164,7 +2166,7 @@ static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, return; } - iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK; + iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK; --iterator->level; } @@ -2203,7 +2205,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, * so we should update the spte at this point to get * a new sp with the correct access. */ - child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK); + child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK); if (child->role.access == direct_access) return; @@ -2224,7 +2226,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); } else { - child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); drop_parent_pte(child, spte); /* @@ -2250,7 +2252,7 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm, int zapped = 0; unsigned i; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + for (i = 0; i < SPTE_ENT_PER_PAGE; ++i) zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); return zapped; @@ -2663,7 +2665,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, struct kvm_mmu_page *child; u64 pte = *sptep; - child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); drop_parent_pte(child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { @@ -3252,7 +3254,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); + sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK); if (WARN_ON(!sp)) return; @@ -3724,7 +3726,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu->pae_root[i]; if (IS_VALID_PAE_ROOT(root)) { - root &= PT64_BASE_ADDR_MASK; + root &= SPTE_BASE_ADDR_MASK; sp = to_shadow_page(root); mmu_sync_children(vcpu, sp, true); } @@ -5186,11 +5188,11 @@ static bool need_remote_flush(u64 old, u64 new) return false; if (!is_shadow_present_pte(new)) return true; - if ((old ^ new) & PT64_BASE_ADDR_MASK) + if ((old ^ new) & SPTE_BASE_ADDR_MASK) return true; old ^= shadow_nx_mask; new ^= shadow_nx_mask; - return (old & ~new & PT64_PERM_MASK) != 0; + return (old & ~new & SPTE_PERM_MASK) != 0; } static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, diff --git a/arch/x86/kvm/mmu/paging.h b/arch/x86/kvm/mmu/paging.h index 23f3f64b8092..6a63727cc7e8 100644 --- a/arch/x86/kvm/mmu/paging.h +++ b/arch/x86/kvm/mmu/paging.h @@ -5,11 +5,28 @@ #define 
GUEST_PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define PT64_LEVEL_BITS 9 + +#define PT64_INDEX(address, level) __PT_INDEX(address, level, PT64_LEVEL_BITS) + #define PT64_LVL_ADDR_MASK(level) \ __PT_LVL_ADDR_MASK(GUEST_PT64_BASE_ADDR_MASK, level, PT64_LEVEL_BITS) #define PT64_LVL_OFFSET_MASK(level) \ __PT_LVL_OFFSET_MASK(GUEST_PT64_BASE_ADDR_MASK, level, PT64_LEVEL_BITS) + +#define PT32_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, PT32_LEVEL_BITS) + +#define PT32_LVL_OFFSET_MASK(level) \ + __PT_LVL_OFFSET_MASK(PT32_BASE_ADDR_MASK, level, PT32_LEVEL_BITS) + +#define PT32_INDEX(address, level) __PT_INDEX(address, level, PT32_LEVEL_BITS) + +#define PT32_BASE_ADDR_MASK PAGE_MASK + +#define PT32_LVL_ADDR_MASK(level) \ + __PT_LVL_ADDR_MASK(PT32_BASE_ADDR_MASK, level, PT32_LEVEL_BITS) + #endif /* __KVM_X86_PAGING_H */ diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 55fd35b1b227..3ec90303e2f5 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -45,7 +45,7 @@ #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define PT_LEVEL_BITS PT32_LEVEL_BITS + #define PT_LEVEL_BITS 10 #define PT_MAX_FULL_LEVELS 2 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT @@ -899,7 +899,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) WARN_ON(sp->role.level != PG_LEVEL_4K); if (PTTYPE == 32) - offset = sp->role.quadrant << PT64_LEVEL_BITS; + offset = sp->role.quadrant << SPTE_LEVEL_BITS; return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); } @@ -1034,7 +1034,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { u64 *sptep, spte; struct kvm_memory_slot *slot; unsigned pte_access; diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index cda1851ec155..242e4828d7df 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -301,7 +301,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) { u64 new_spte; - new_spte = old_spte & ~PT64_BASE_ADDR_MASK; + new_spte = old_spte & ~SPTE_BASE_ADDR_MASK; new_spte |= (u64)new_pfn << PAGE_SHIFT; new_spte &= ~PT_WRITABLE_MASK; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index d5a8183b7232..121c5eaaec77 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -36,12 +36,12 @@ extern bool __read_mostly enable_mmio_caching; static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK -#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) +#define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) #else -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) #endif -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ +#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ | shadow_x_mask | shadow_nx_mask | shadow_me_mask) #define ACC_EXEC_MASK 1 @@ -50,16 +50,13 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) /* The mask for the R/X bits in EPT PTEs */ -#define PT64_EPT_READABLE_MASK 0x1ull -#define PT64_EPT_EXECUTABLE_MASK 0x4ull +#define 
SPTE_EPT_READABLE_MASK 0x1ull +#define SPTE_EPT_EXECUTABLE_MASK 0x4ull -#define PT64_LEVEL_BITS 9 - -#define PT64_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, PT64_LEVEL_BITS) - -#define PT64_INDEX(address, level) __PT_INDEX(address, level, PT64_LEVEL_BITS) - -#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) +#define SPTE_LEVEL_BITS 9 +#define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS) +#define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS) +#define SPTE_ENT_PER_PAGE __PT_ENT_PER_PAGE(SPTE_LEVEL_BITS) /* * The mask/shift to use for saving the original R/X bits when marking the PTE @@ -68,8 +65,8 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); * restored only when a write is attempted to the page. This mask obviously * must not overlap the A/D type mask. */ -#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \ - PT64_EPT_EXECUTABLE_MASK) +#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \ + SPTE_EPT_EXECUTABLE_MASK) #define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54 #define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) @@ -281,7 +278,7 @@ static inline bool is_executable_pte(u64 spte) static inline kvm_pfn_t spte_to_pfn(u64 pte) { - return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT; } static inline bool is_accessed_spte(u64 spte) diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index ee4802d7b36c..9c65a64a56d9 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -11,7 +11,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) { iter->sptep = iter->pt_path[iter->level - 1] + - SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); + SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level); iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep); } @@ -116,8 +116,8 @@ static bool try_step_side(struct tdp_iter *iter) * Check if the iterator is already at the end of the current page * table. */ - if (SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == - (PT64_ENT_PER_PAGE - 1)) + if (SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == + (SPTE_ENT_PER_PAGE - 1)) return false; iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 2cd060eb34a4..8e8d412fb65c 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -425,7 +425,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) tdp_mmu_unlink_sp(kvm, sp, shared); - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { tdp_ptep_t sptep = pt + i; gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); u64 old_spte; @@ -1477,7 +1477,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * No need for atomics when writing to sp->spt since the page table has * not been linked in yet and thus is not reachable from any other CPU. */ - for (i = 0; i < PT64_ENT_PER_PAGE; i++) + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i); /* @@ -1497,7 +1497,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * are overwriting from the page stats. But we have to manually update * the page stats with the new present child pages. 
*/ - kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE); + kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE); out: trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret); -- cgit v1.2.3 From f6b8ea6d43640ebfd1aeaa1faf1016d0bff7b8a0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 15 Jun 2022 10:15:56 -0400 Subject: KVM: x86/mmu: Use common macros to compute 32/64-bit paging masks Dedup the code for generating (most of) the per-type PT_* masks in paging_tmpl.h. The relevant macros only vary based on the number of bits per level, and that smidge of info is already provided in a common form as PT_LEVEL_BITS. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220614233328.3896033-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging.h | 23 ----------------------- arch/x86/kvm/mmu/paging_tmpl.h | 25 +++++++++++-------------- 2 files changed, 11 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/paging.h b/arch/x86/kvm/mmu/paging.h index 6a63727cc7e8..9de4976b2d46 100644 --- a/arch/x86/kvm/mmu/paging.h +++ b/arch/x86/kvm/mmu/paging.h @@ -5,28 +5,5 @@ #define GUEST_PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) -#define PT64_LEVEL_BITS 9 - -#define PT64_INDEX(address, level) __PT_INDEX(address, level, PT64_LEVEL_BITS) - -#define PT64_LVL_ADDR_MASK(level) \ - __PT_LVL_ADDR_MASK(GUEST_PT64_BASE_ADDR_MASK, level, PT64_LEVEL_BITS) - -#define PT64_LVL_OFFSET_MASK(level) \ - __PT_LVL_OFFSET_MASK(GUEST_PT64_BASE_ADDR_MASK, level, PT64_LEVEL_BITS) - - -#define PT32_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, PT32_LEVEL_BITS) - -#define PT32_LVL_OFFSET_MASK(level) \ - __PT_LVL_OFFSET_MASK(PT32_BASE_ADDR_MASK, level, PT32_LEVEL_BITS) - -#define PT32_INDEX(address, level) __PT_INDEX(address, level, PT32_LEVEL_BITS) - -#define PT32_BASE_ADDR_MASK PAGE_MASK - -#define PT32_LVL_ADDR_MASK(level) \ - __PT_LVL_ADDR_MASK(PT32_BASE_ADDR_MASK, level, PT32_LEVEL_BITS) - #endif /* __KVM_X86_PAGING_H */ diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 3ec90303e2f5..61b328b0f2dc 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -16,8 +16,9 @@ */ /* - * We need the mmu code to access both 32-bit and 64-bit guest ptes, - * so the code in this file is compiled twice, once per pte size. + * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables, + * as well as guest EPT tables, so the code in this file is compiled thrice, + * once per guest PTE type. The per-type defines are #undef'd at the end. 
*/ #if PTTYPE == 64 @@ -25,10 +26,7 @@ #define guest_walker guest_walker64 #define FNAME(name) paging##64_##name #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true @@ -41,10 +39,7 @@ #define pt_element_t u32 #define guest_walker guest_walker32 #define FNAME(name) paging##32_##name - #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT32_INDEX(addr, level) + #define PT_BASE_ADDR_MASK PAGE_MASK #define PT_LEVEL_BITS 10 #define PT_MAX_FULL_LEVELS 2 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT @@ -60,10 +55,7 @@ #define guest_walker guest_walkerEPT #define FNAME(name) ept_##name #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT 9 #define PT_GUEST_ACCESSED_SHIFT 8 #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled) @@ -72,6 +64,11 @@ #error Invalid PTTYPE value #endif +/* Common logic, but per-type values. These also need to be undefined. */ +#define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) +#define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) +#define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS) + #define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT) #define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT) -- cgit v1.2.3 From f7384b8866b0b07f249130aa8b63135687626c5c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 23:33:27 +0000 Subject: KVM: x86/mmu: Truncate paging32's PT_BASE_ADDR_MASK to 32 bits Truncate paging32's PT_BASE_ADDR_MASK to a pt_element_t, i.e. to 32 bits. Ignoring PSE huge pages, the mask is only used in conjunction with gPTEs, which are 32 bits, and so the address is limited to bits 31:12. PSE huge pages encoded PA bits 39:32 in PTE bits 20:13, i.e. need custom logic to handle their funky encoding regardless of PT_BASE_ADDR_MASK. Note, PT_LVL_OFFSET_MASK is somewhat confusing in that it computes the offset of the _gfn_, not of the gpa, i.e. not having bits 63:32 set in PT_BASE_ADDR_MASK is again correct. No functional change intended. 
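For reference, the PSE-36 encoding called out above can be illustrated with a small standalone sketch (illustrative only; it mirrors the shape of KVM's pse36_gfn_delta() under the assumption of 4KiB pages and the 8-bit field described in the message, and is not the kernel's code):

#include <stdio.h>
#include <stdint.h>

#define EX_PAGE_SHIFT 12
#define EX_PSE36_SHIFT 13				/* PTE bits 20:13 ... */
#define EX_PSE36_MASK (0xffu << EX_PSE36_SHIFT)	/* ... hold PA bits 39:32 */

/* gfn delta: move PTE bits 20:13 up to gfn bits 27:20, i.e. PA bits 39:32. */
static uint64_t ex_pse36_gfn_delta(uint32_t gpte)
{
	return (uint64_t)(gpte & EX_PSE36_MASK) <<
	       (32 - EX_PSE36_SHIFT - EX_PAGE_SHIFT);
}

int main(void)
{
	/* Hypothetical PSE PDE: 4MiB page at 0x00400000, PA bits 33:32 set. */
	uint32_t gpte = 0x00406083;
	uint64_t gfn = (uint64_t)(gpte & 0xffc00000u) >> EX_PAGE_SHIFT;

	gfn += ex_pse36_gfn_delta(gpte);
	printf("gfn = 0x%llx\n", (unsigned long long)gfn);
	return 0;
}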
Signed-off-by: Sean Christopherson Message-Id: <20220614233328.3896033-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 61b328b0f2dc..b7424d63c706 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -39,7 +39,7 @@ #define pt_element_t u32 #define guest_walker guest_walker32 #define FNAME(name) paging##32_##name - #define PT_BASE_ADDR_MASK PAGE_MASK + #define PT_BASE_ADDR_MASK ((pt_element_t)PAGE_MASK) #define PT_LEVEL_BITS 10 #define PT_MAX_FULL_LEVELS 2 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT -- cgit v1.2.3 From 70e41c31bc7776b262cd9f524df3dfc2b5869a0a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 23:33:28 +0000 Subject: KVM: x86/mmu: Use common logic for computing the 32/64-bit base PA mask Use common logic for computing PT_BASE_ADDR_MASK for 32-bit, 64-bit, and EPT paging. Both PAGE_MASK and the new common logic are supersets of what is actually needed for 32-bit paging. PAGE_MASK sets bits 63:12 and the former GUEST_PT64_BASE_ADDR_MASK sets bits 51:12, so regardless of which value is used, the result will always be bits 31:12. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220614233328.3896033-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 -- arch/x86/kvm/mmu/paging.h | 9 --------- arch/x86/kvm/mmu/paging_tmpl.h | 4 +--- 3 files changed, 1 insertion(+), 14 deletions(-) delete mode 100644 arch/x86/kvm/mmu/paging.h (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 75f9e344812e..2dcedf04ef85 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -53,8 +53,6 @@ #include #include "trace.h" -#include "paging.h" - extern bool itlb_multihit_kvm_mitigation; int __read_mostly nx_huge_pages = -1; diff --git a/arch/x86/kvm/mmu/paging.h b/arch/x86/kvm/mmu/paging.h deleted file mode 100644 index 9de4976b2d46..000000000000 --- a/arch/x86/kvm/mmu/paging.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* Shadow paging constants/helpers that don't need to be #undef'd. */ -#ifndef __KVM_X86_PAGING_H -#define __KVM_X86_PAGING_H - -#define GUEST_PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) - -#endif /* __KVM_X86_PAGING_H */ - diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index b7424d63c706..e4655056e651 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -25,7 +25,6 @@ #define pt_element_t u64 #define guest_walker guest_walker64 #define FNAME(name) paging##64_##name - #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT @@ -39,7 +38,6 @@ #define pt_element_t u32 #define guest_walker guest_walker32 #define FNAME(name) paging##32_##name - #define PT_BASE_ADDR_MASK ((pt_element_t)PAGE_MASK) #define PT_LEVEL_BITS 10 #define PT_MAX_FULL_LEVELS 2 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT @@ -54,7 +52,6 @@ #define pt_element_t u64 #define guest_walker guest_walkerEPT #define FNAME(name) ept_##name - #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT 9 #define PT_GUEST_ACCESSED_SHIFT 8 @@ -65,6 +62,7 @@ #endif /* Common logic, but per-type values.
These also need to be undefined. */ +#define PT_BASE_ADDR_MASK ((pt_element_t)(((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) #define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) #define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) #define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS) -- cgit v1.2.3 From fe1911aa443ed774df46607970bed58d9769db41 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 29 Apr 2022 01:04:11 +0000 Subject: KVM: nVMX: Use kvm_vcpu_map() to get/pin vmcs12's APIC-access page Use kvm_vcpu_map() to get/pin the backing for vmcs12's APIC-access page, there's no reason it has to be restricted to 'struct page' backing. The APIC-access page actually doesn't need to be backed by anything, which is ironically why it got left behind by the series which introduced kvm_vcpu_map()[1]; the plan was to shove a dummy pfn into vmcs02[2], but that code never got merged. Switching the APIC-access page to kvm_vcpu_map() doesn't preclude using a magic pfn in the future, and will allow a future patch to drop kvm_vcpu_gpa_to_page(). [1] https://lore.kernel.org/all/1547026933-31226-1-git-send-email-karahmed@amazon.de [2] https://lore.kernel.org/lkml/1543845551-4403-1-git-send-email-karahmed@amazon.de Signed-off-by: Sean Christopherson Message-Id: <20220429010416.2788472-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 39 ++++++++++++--------------------------- arch/x86/kvm/vmx/vmx.h | 2 +- 2 files changed, 13 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index aad938e1e51d..778f82015f03 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -311,11 +311,12 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->nested.cached_vmcs12 = NULL; kfree(vmx->nested.cached_shadow_vmcs12); vmx->nested.cached_shadow_vmcs12 = NULL; - /* Unpin physical memory we referred to in the vmcs02 */ - if (vmx->nested.apic_access_page) { - kvm_release_page_clean(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } + /* + * Unpin physical memory we referred to in the vmcs02. The APIC access + * page's backing page (yeah, confusing) shouldn't actually be accessed, + * and if it is written, the contents are irrelevant. + */ + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); vmx->nested.pi_desc = NULL; @@ -3164,8 +3165,6 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_host_map *map; - struct page *page; - u64 hpa; if (!vcpu->arch.pdptrs_from_userspace && !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { @@ -3180,23 +3179,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - /* - * Translate L1 physical address to host physical - * address for vmcs02. Keep the page pinned, so this - * physical address remains valid. We keep a reference - * to it so we can release it later. 
- */ - if (vmx->nested.apic_access_page) { /* shouldn't happen */ - kvm_release_page_clean(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); - if (!is_error_page(page)) { - vmx->nested.apic_access_page = page; - hpa = page_to_phys(vmx->nested.apic_access_page); - vmcs_write64(APIC_ACCESS_ADDR, hpa); + map = &vmx->nested.apic_access_page_map; + + if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { + vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); } else { - pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", + pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", __func__); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = @@ -4632,10 +4620,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, } /* Unpin physical memory we referred to in vmcs02 */ - if (vmx->nested.apic_access_page) { - kvm_release_page_clean(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); vmx->nested.pi_desc = NULL; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a84c91ee2a48..286c88e285ea 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -204,7 +204,7 @@ struct nested_vmx { * Guest pages referred to in the vmcs02 with host-physical * pointers, so we must keep them pinned while L2 runs. */ - struct page *apic_access_page; + struct kvm_host_map apic_access_page_map; struct kvm_host_map virtual_apic_map; struct kvm_host_map pi_desc_map; -- cgit v1.2.3 From 284dc49307738d2a897fd375431f741213cd0f27 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 29 Apr 2022 01:04:14 +0000 Subject: KVM: Take a 'struct page', not a pfn in kvm_is_zone_device_page() Operate on a 'struct page' instead of a pfn when checking if a page is a ZONE_DEVICE page, and rename the helper accordingly. Generally speaking, KVM doesn't actually care about ZONE_DEVICE memory, i.e. shouldn't do anything special for ZONE_DEVICE memory. Rather, KVM wants to treat ZONE_DEVICE memory like regular memory, and the need to identify ZONE_DEVICE memory only arises as an exception to PG_reserved pages. In other words, KVM should only ever check for ZONE_DEVICE memory after KVM has already verified that there is a struct page associated with the pfn. No functional change intended. 
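The ordering rule can be summarized with a rough kernel-style sketch (illustrative only, not the actual implementation):

	/*
	 * Only ask "is this ZONE_DEVICE?" once the pfn is known to have
	 * an associated struct page; pfn_valid() must come first.
	 */
	if (!pfn_valid(pfn))
		return false;		/* no struct page at all */
	return is_zone_device_page(pfn_to_page(pfn));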
Signed-off-by: Sean Christopherson Message-Id: <20220429010416.2788472-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 5 +++-- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 8 ++++---- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2dcedf04ef85..b209aaf096df 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2788,15 +2788,16 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, const struct kvm_memory_slot *slot) { + struct page *page = pfn_to_page(pfn); + int level = PG_LEVEL_4K; unsigned long hva; unsigned long flags; - int level = PG_LEVEL_4K; pgd_t pgd; p4d_t p4d; pud_t pud; pmd_t pmd; - if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) + if (!PageCompound(page) && !kvm_is_zone_device_page(page)) return PG_LEVEL_4K; /* diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4334789409c0..6fb433ac2ba1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1571,7 +1571,7 @@ void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); bool kvm_is_reserved_pfn(kvm_pfn_t pfn); -bool kvm_is_zone_device_pfn(kvm_pfn_t pfn); +bool kvm_is_zone_device_page(struct page *page); struct kvm_irq_ack_notifier { struct hlist_node link; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7cd0e3d67f9f..aa1bcd5f140d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -168,7 +168,7 @@ __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { } -bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) +bool kvm_is_zone_device_page(struct page *page) { /* * The metadata used by is_zone_device_page() to determine whether or @@ -176,10 +176,10 @@ bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) * the device has been pinned, e.g. by get_user_pages(). WARN if the * page_count() is zero to help detect bad usage of this helper. */ - if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn)))) + if (WARN_ON_ONCE(!page_count(page))) return false; - return is_zone_device_page(pfn_to_page(pfn)); + return is_zone_device_page(page); } bool kvm_is_reserved_pfn(kvm_pfn_t pfn) @@ -192,7 +192,7 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn) if (pfn_valid(pfn)) return PageReserved(pfn_to_page(pfn)) && !is_zero_pfn(pfn) && - !kvm_is_zone_device_pfn(pfn); + !kvm_is_zone_device_page(pfn_to_page(pfn)); return true; } -- cgit v1.2.3 From b14b2690c50e02145bb867dfcde8845eb17aa8a4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 29 Apr 2022 01:04:15 +0000 Subject: KVM: Rename/refactor kvm_is_reserved_pfn() to kvm_pfn_to_refcounted_page() Rename and refactor kvm_is_reserved_pfn() to kvm_pfn_to_refcounted_page() to better reflect what KVM is actually checking, and to eliminate extra pfn_to_page() lookups. The kvm_release_pfn_*() and kvm_try_get_pfn() helpers in particular benefit from "refcounted" nomenclature, as it's not all that obvious why KVM needs to get/put refcounts for some PG_reserved pages (ZERO_PAGE and ZONE_DEVICE). Add a comment to call out that the list of exceptions to PG_reserved is all but guaranteed to be incomplete. The list has mostly been compiled by people throwing noodles at KVM and finding out they stick a little too well, e.g. the ZERO_PAGE's refcount overflowed and ZONE_DEVICE pages didn't get freed. No functional change intended.
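A rough before/after sketch of the resulting caller pattern (illustrative only):

	/* Before: a boolean check plus a second pfn_to_page() lookup. */
	if (!kvm_is_reserved_pfn(pfn))
		kvm_release_page_clean(pfn_to_page(pfn));

	/* After: a single lookup that yields the page (or NULL) directly. */
	page = kvm_pfn_to_refcounted_page(pfn);
	if (page)
		kvm_release_page_clean(page);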
Signed-off-by: Sean Christopherson Message-Id: <20220429010416.2788472-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 15 ++++++----- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 66 ++++++++++++++++++++++++++++++++++++---------- 4 files changed, 63 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b209aaf096df..7ebb97c95da7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -534,6 +534,7 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) kvm_pfn_t pfn; u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level; + struct page *page; if (!is_shadow_present_pte(old_spte) || !spte_has_volatile_bits(old_spte)) @@ -549,11 +550,13 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) pfn = spte_to_pfn(old_spte); /* - * KVM does not hold the refcount of the page used by - * kvm mmu, before reclaiming the page, we should - * unmap it from mmu first. + * KVM doesn't hold a reference to any pages mapped into the guest, and + * instead uses the mmu_notifier to ensure that KVM unmaps any pages + * before they are reclaimed. Sanity check that, if the pfn is backed + * by a refcounted page, the refcount is elevated. */ - WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); + page = kvm_pfn_to_refcounted_page(pfn); + WARN_ON(page && !page_count(page)); if (is_accessed_spte(old_spte)) kvm_set_pfn_accessed(pfn); @@ -2881,7 +2884,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (unlikely(fault->max_level == PG_LEVEL_4K)) return; - if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn)) + if (is_error_noslot_pfn(fault->pfn) || !kvm_pfn_to_refcounted_page(fault->pfn)) return; if (kvm_slot_dirty_track_enabled(slot)) @@ -5993,7 +5996,7 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. 
*/ - if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && + if (sp->role.direct && kvm_pfn_to_refcounted_page(pfn) && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, pfn, PG_LEVEL_NUM)) { pte_list_remove(kvm, rmap_head, sptep); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 8e8d412fb65c..1e777b739ac4 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1751,7 +1751,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm, */ pfn = spte_to_pfn(iter.old_spte); - if (kvm_is_reserved_pfn(pfn)) + if (!kvm_pfn_to_refcounted_page(pfn)) continue; max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6fb433ac2ba1..a2bbdf3ab086 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1570,7 +1570,7 @@ void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); -bool kvm_is_reserved_pfn(kvm_pfn_t pfn); +struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn); bool kvm_is_zone_device_page(struct page *page); struct kvm_irq_ack_notifier { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index aa1bcd5f140d..04196096f9e4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -182,19 +182,36 @@ bool kvm_is_zone_device_page(struct page *page) return is_zone_device_page(page); } -bool kvm_is_reserved_pfn(kvm_pfn_t pfn) +/* + * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted + * page, NULL otherwise. Note, the list of refcounted PG_reserved page types + * is likely incomplete, it has been compiled purely through people wanting to + * back guest with a certain type of memory and encountering issues. + */ +struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn) { + struct page *page; + + if (!pfn_valid(pfn)) + return NULL; + + page = pfn_to_page(pfn); + if (!PageReserved(page)) + return page; + + /* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */ + if (is_zero_pfn(pfn)) + return page; + /* * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting * perspective they are "normal" pages, albeit with slightly different * usage rules. 
*/ - if (pfn_valid(pfn)) - return PageReserved(pfn_to_page(pfn)) && - !is_zero_pfn(pfn) && - !kvm_is_zone_device_page(pfn_to_page(pfn)); + if (kvm_is_zone_device_page(page)) + return page; - return true; + return NULL; } /* @@ -2501,9 +2518,12 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) static int kvm_try_get_pfn(kvm_pfn_t pfn) { - if (kvm_is_reserved_pfn(pfn)) + struct page *page = kvm_pfn_to_refcounted_page(pfn); + + if (!page) return 1; - return get_page_unless_zero(pfn_to_page(pfn)); + + return get_page_unless_zero(page); } static int hva_to_pfn_remapped(struct vm_area_struct *vma, @@ -2728,6 +2748,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); */ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { + struct page *page; kvm_pfn_t pfn; pfn = gfn_to_pfn(kvm, gfn); @@ -2735,10 +2756,11 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) if (is_error_noslot_pfn(pfn)) return KVM_ERR_PTR_BAD_PAGE; - if (kvm_is_reserved_pfn(pfn)) + page = kvm_pfn_to_refcounted_page(pfn); + if (!page) return KVM_ERR_PTR_BAD_PAGE; - return pfn_to_page(pfn); + return page; } EXPORT_SYMBOL_GPL(gfn_to_page); @@ -2841,8 +2863,16 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean); void kvm_release_pfn_clean(kvm_pfn_t pfn) { - if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) - kvm_release_page_clean(pfn_to_page(pfn)); + struct page *page; + + if (is_error_noslot_pfn(pfn)) + return; + + page = kvm_pfn_to_refcounted_page(pfn); + if (!page) + return; + + kvm_release_page_clean(page); } EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); @@ -2857,8 +2887,16 @@ EXPORT_SYMBOL_GPL(kvm_release_page_dirty); void kvm_release_pfn_dirty(kvm_pfn_t pfn) { - if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) - kvm_release_page_dirty(pfn_to_page(pfn)); + struct page *page; + + if (is_error_noslot_pfn(pfn)) + return; + + page = kvm_pfn_to_refcounted_page(pfn); + if (!page) + return; + + kvm_release_page_dirty(page); } EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); -- cgit v1.2.3 From 5d49f08c2e08c1f0de1bb0f2e1307ec969451729 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 29 Apr 2022 01:04:16 +0000 Subject: KVM: x86/mmu: Shove refcounted page dependency into host_pfn_mapping_level() Move the check that restricts mapping huge pages into the guest to pfns that are backed by refcounted 'struct page' memory into the helper that actually "requires" a 'struct page', host_pfn_mapping_level(). In addition to deduplicating code, moving the check to the helper eliminates the subtle requirement that the caller check that the incoming pfn is backed by a refcounted struct page, and as an added bonus avoids an extra pfn_to_page() lookup. Note, the is_error_noslot_pfn() check in kvm_mmu_hugepage_adjust() needs to stay where it is, as it guards against dereferencing a NULL memslot in the kvm_slot_dirty_track_enabled() that follows. No functional change intended. 
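A before/after sketch of the caller contract described above (illustrative only):

	/* Before: callers had to pre-check for a refcounted struct page. */
	if (kvm_pfn_to_refcounted_page(pfn))
		level = host_pfn_mapping_level(kvm, gfn, pfn, slot);

	/*
	 * After: the helper itself falls back to PG_LEVEL_4K for pfns
	 * without a refcounted struct page.
	 */
	level = host_pfn_mapping_level(kvm, gfn, pfn, slot);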
Signed-off-by: Sean Christopherson Message-Id: <20220429010416.2788472-11-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 14 +++++++++++--- arch/x86/kvm/mmu/tdp_mmu.c | 4 ---- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7ebb97c95da7..27b2a5603496 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2791,8 +2791,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, const struct kvm_memory_slot *slot) { - struct page *page = pfn_to_page(pfn); int level = PG_LEVEL_4K; + struct page *page; unsigned long hva; unsigned long flags; pgd_t pgd; @@ -2800,6 +2800,14 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, pud_t pud; pmd_t pmd; + /* + * Note, @slot must be non-NULL, i.e. the caller is responsible for + * ensuring @pfn isn't garbage and is backed by a memslot. + */ + page = kvm_pfn_to_refcounted_page(pfn); + if (!page) + return PG_LEVEL_4K; + if (!PageCompound(page) && !kvm_is_zone_device_page(page)) return PG_LEVEL_4K; @@ -2884,7 +2892,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (unlikely(fault->max_level == PG_LEVEL_4K)) return; - if (is_error_noslot_pfn(fault->pfn) || !kvm_pfn_to_refcounted_page(fault->pfn)) + if (is_error_noslot_pfn(fault->pfn)) return; if (kvm_slot_dirty_track_enabled(slot)) @@ -5996,7 +6004,7 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ - if (sp->role.direct && kvm_pfn_to_refcounted_page(pfn) && + if (sp->role.direct && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, pfn, PG_LEVEL_NUM)) { pte_list_remove(kvm, rmap_head, sptep); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 1e777b739ac4..1ea40809ef1f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1750,10 +1750,6 @@ static void zap_collapsible_spte_range(struct kvm *kvm, * be mapped at a higher level. */ pfn = spte_to_pfn(iter.old_spte); - - if (!kvm_pfn_to_refcounted_page(pfn)) - continue; - max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn, pfn, PG_LEVEL_NUM); -- cgit v1.2.3 From e20918f6d11253d62b110e8d16b17cc9bf82d832 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Tue, 14 Jun 2022 21:34:58 +0800 Subject: x86: kvm: remove NULL check before kfree kfree() can handle a NULL pointer as its argument. According to the Coccinelle isnullfree check, remove the NULL check before the kfree() call. Signed-off-by: Dongliang Mu Message-Id: <20220614133458.147314-1-dzm91@hust.edu.cn> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 1a3658f7e6d9..d4e48b4a438b 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -236,8 +236,7 @@ again: raw_spin_unlock(&b->lock); /* A dummy token might be allocated and ultimately not used. */ - if (dummy) - kfree(dummy); + kfree(dummy); } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); -- cgit v1.2.3 From 9fc222967a39d6be96dabb942cc94e4f07ca049c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 11 Jun 2022 00:57:49 +0000 Subject: KVM: x86: Give host userspace full control of MSR_IA32_MISC_ENABLES Give userspace full control of the read-only bits in MISC_ENABLES, i.e.
do not modify bits on PMU refresh and do not preserve existing bits when userspace writes MISC_ENABLES. With a few exceptions where KVM doesn't expose the necessary controls to userspace _and_ there is a clear cut association with CPUID, e.g. reserved CR4 bits, KVM does not own the vCPU and should not manipulate the vCPU model on behalf of "dummy user space". The argument that KVM is doing userspace a favor because "the order of setting vPMU capabilities and MSR_IA32_MISC_ENABLE is not strictly guaranteed" is specious, as attempting to configure MSRs on behalf of userspace inevitably leads to edge cases precisely because KVM does not prescribe a specific order of initialization. Example #1: intel_pmu_refresh() consumes and modifies the vCPU's MSR_IA32_PERF_CAPABILITIES, and so assumes userspace initializes config MSRs before setting the guest CPUID model. If userspace sets CPUID first, then KVM will mark PEBS as available when arch.perf_capabilities is initialized with a non-zero PEBS format, thus creating a bad vCPU model if userspace later disables PEBS by writing PERF_CAPABILITIES. Example #2: intel_pmu_refresh() does not clear PERF_CAP_PEBS_MASK in MSR_IA32_PERF_CAPABILITIES if there is no vPMU, making KVM inconsistent in its desire to be consistent. Example #3: intel_pmu_refresh() does not clear MSR_IA32_MISC_ENABLE_EMON if KVM_SET_CPUID2 is called multiple times, first with a vPMU, then without a vPMU. While slightly contrived, it's plausible a VMM could reflect KVM's default vCPU and then operate on KVM's copy of CPUID to later clear the vPMU settings, e.g. see KVM's selftests. Example #4: Enumerating an Intel vCPU on an AMD host will not call into intel_pmu_refresh() at any point, and so the BTS and PEBS "unavailable" bits will be left clear, without any way for userspace to set them. Keep the "R" behavior of the bit 7, "EMON available", for the guest. Unlike the BTS and PEBS bits, which are fully "RO", the EMON bit can be written with a different value, but that new value is ignored. 
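The resulting guest write semantics can be sketched as follows (illustrative pseudo-masks; the real masks are MSR_IA32_MISC_ENABLE_PMU_RO_MASK for the "RO" bits and MSR_IA32_MISC_ENABLE_EMON for the "R" bit):

	if (!host_initiated) {
		/* "RO" bits (BTS/PEBS unavailable): changing them faults. */
		if ((old_val ^ data) & EX_RO_BITS)
			return 1;

		/* "R" bit (EMON available): the write is silently dropped. */
		data = (data & ~EX_R_BITS) | (old_val & EX_R_BITS);
	}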
Cc: Like Xu Signed-off-by: Sean Christopherson Reported-by: kernel test robot Message-Id: <20220611005755.753273-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 5 ----- arch/x86/kvm/x86.c | 24 +++++++++++------------- 2 files changed, 11 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 422f0a6562ac..3b324ce0b142 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -536,8 +536,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->pebs_enable_mask = ~0ull; pmu->pebs_data_cfg_mask = ~0ull; - vcpu->arch.ia32_misc_enable_msr |= MSR_IA32_MISC_ENABLE_PMU_RO_MASK; - entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry || !vcpu->kvm->arch.enable_pmu) return; @@ -548,8 +546,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (!pmu->version) return; - vcpu->arch.ia32_misc_enable_msr |= MSR_IA32_MISC_ENABLE_EMON; - pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, kvm_pmu_cap.num_counters_gp); eax.split.bit_width = min_t(int, eax.split.bit_width, @@ -611,7 +607,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_FORMAT) { - vcpu->arch.ia32_misc_enable_msr &= ~MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL; if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_BASELINE) { pmu->pebs_enable_mask = counter_mask; pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00e23dc518e0..67bdd7e20534 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3550,21 +3550,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_MISC_ENABLE: { u64 old_val = vcpu->arch.ia32_misc_enable_msr; - u64 pmu_mask = MSR_IA32_MISC_ENABLE_PMU_RO_MASK | - MSR_IA32_MISC_ENABLE_EMON; - /* RO bits */ - if (!msr_info->host_initiated && - ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK)) - return 1; + if (!msr_info->host_initiated) { + /* RO bits */ + if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK) + return 1; + + /* R bits, i.e. writes are ignored, but don't fault. */ + data = data & ~MSR_IA32_MISC_ENABLE_EMON; + data |= old_val & MSR_IA32_MISC_ENABLE_EMON; + } - /* - * For a dummy user space, the order of setting vPMU capabilities and - * initialising MSR_IA32_MISC_ENABLE is not strictly guaranteed, so to - * avoid inconsistent functionality we keep the vPMU bits unchanged here. - */ - data &= ~pmu_mask; - data |= old_val & pmu_mask; if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) @@ -11573,6 +11569,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.smbase = 0x30000; vcpu->arch.msr_misc_features_enables = 0; + vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | + MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true); -- cgit v1.2.3 From 0f4a7185270c4879fa40b33d7e07b6ac38353b34 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 11 Jun 2022 00:57:50 +0000 Subject: KVM: VMX: Give host userspace full control of MSR_IA32_PERF_CAPABILITIES Do not manipulate MSR_IA32_PERF_CAPABILITIES in intel_pmu_refresh(), i.e. give userspace full control over capability/read-only MSRs.
KVM is not a babysitter, it is userspace's responsibility to provide a valid and coherent vCPU model. Attempting to "help" the guest by forcing a consistent model creates edge cases, and ironically leads to inconsistent behavior. Example #1: KVM doesn't do intel_pmu_refresh() when userspace writes the MSR. Example #2: KVM doesn't clear the bits when the PMU is disabled, or when there's no architectural PMU. Signed-off-by: Sean Christopherson Message-Id: <20220611005755.753273-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 3b324ce0b142..b62012766226 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -619,8 +619,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->pebs_enable_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1); } - } else { - vcpu->arch.perf_capabilities &= ~PERF_CAP_PEBS_MASK; } } -- cgit v1.2.3 From 5d4283df5a0fc8299fba9443c33d219939eccc2d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 11 Jun 2022 00:57:51 +0000 Subject: Revert "KVM: x86/pmu: Accept 0 for absent PMU MSRs when host-initiated if !enable_pmu" Eating reads and writes to all "PMU" MSRs when there is no PMU is wildly broken as it results in allowing accesses to _any_ MSR on Intel CPUs as intel_is_valid_msr() returns true for all host_initiated accesses. A revert of commit d1c88a402056 ("KVM: x86: always allow host-initiated writes to PMU MSRs") will soon follow. This reverts commit 8e6a58e28b34e8d247e772159b8fa8f6bae39192. Signed-off-by: Sean Christopherson Message-Id: <20220611005755.753273-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 8 -------- arch/x86/kvm/svm/pmu.c | 11 +---------- 2 files changed, 1 insertion(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 6a32092460d3..87483e503c46 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -442,19 +442,11 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - if (msr_info->host_initiated && !vcpu->kvm->arch.enable_pmu) { - msr_info->data = 0; - return 0; - } - return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info); } int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - if (msr_info->host_initiated && !vcpu->kvm->arch.enable_pmu) - return !!msr_info->data; - kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info); } diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index fe520b2649b5..256244b8f89c 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -182,16 +182,7 @@ static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated) { /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */ - if (!host_initiated) - return false; - - switch (msr) { - case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3: - case MSR_F15H_PERF_CTL0 ...
MSR_F15H_PERF_CTR5: - return true; - default: - return false; - } + return false; } static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) -- cgit v1.2.3 From 545feb96c052809dab5ec04b95f976acca9f9364 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 11 Jun 2022 00:57:52 +0000 Subject: Revert "KVM: x86: always allow host-initiated writes to PMU MSRs" Revert the hack to allow host-initiated accesses to all "PMU" MSRs, as intel_is_valid_msr() returns true for _all_ MSRs, regardless of whether or not it has a snowball's chance in hell of actually being a PMU MSR. That mostly gets papered over by the actual get/set helpers only handling MSRs that they know about, except there's the minor detail that kvm_pmu_{g,s}et_msr() eat reads and writes when the PMU is disabled. I.e. KVM will happily allow reads and writes to _any_ MSR if the PMU is disabled, either via module param or capability. This reverts commit d1c88a4020567ba4da52f778bcd9619d87e4ea75. Fixes: d1c88a402056 ("KVM: x86: always allow host-initiated writes to PMU MSRs") Signed-off-by: Sean Christopherson Message-Id: <20220611005755.753273-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 4 ++-- arch/x86/kvm/pmu.h | 4 ++-- arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 27 ++++++++++----------------- arch/x86/kvm/x86.c | 10 +++++----- 5 files changed, 20 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 87483e503c46..02f9e4f245bd 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -425,10 +425,10 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) } } -bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated) +bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) || - static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr, host_initiated); + static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr); } static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index c1b61671ba1e..5cc5721f260b 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -32,7 +32,7 @@ struct kvm_pmu_ops { unsigned int idx, u64 *mask); struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr); bool (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); - bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated); + bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void (*refresh)(struct kvm_vcpu *vcpu); @@ -189,7 +189,7 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); -bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated); +bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void kvm_pmu_refresh(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 256244b8f89c..f24613a108c5 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -179,7 +179,7 @@ static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return
&counters[idx]; } -static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated) +static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */ return false; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index b62012766226..b1aae60cf061 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -196,45 +196,38 @@ static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) return ret; } -static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated) +static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); u64 perf_capabilities = vcpu->arch.perf_capabilities; + int ret; switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: case MSR_CORE_PERF_GLOBAL_STATUS: case MSR_CORE_PERF_GLOBAL_CTRL: case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (host_initiated) - return true; - return pmu->version > 1; + ret = pmu->version > 1; break; case MSR_IA32_PEBS_ENABLE: - if (host_initiated) - return true; - return perf_capabilities & PERF_CAP_PEBS_FORMAT; + ret = perf_capabilities & PERF_CAP_PEBS_FORMAT; break; case MSR_IA32_DS_AREA: - if (host_initiated) - return true; - return guest_cpuid_has(vcpu, X86_FEATURE_DS); + ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); break; case MSR_PEBS_DATA_CFG: - if (host_initiated) - return true; - return (perf_capabilities & PERF_CAP_PEBS_BASELINE) && + ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) && ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3); break; default: - if (host_initiated) - return true; - return get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || + ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr) || intel_pmu_is_valid_lbr_msr(vcpu, msr); break; } + + return ret; } static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) @@ -596,7 +589,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); nested_vmx_pmu_refresh(vcpu, - intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, false)); + intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)); if (cpuid_model_is_consistent(vcpu)) x86_perf_get_lbr(&lbr_desc->records); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 67bdd7e20534..46301b148f64 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3704,7 +3704,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: - if (kvm_pmu_is_valid_msr(vcpu, msr, msr_info->host_initiated)) + if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (pr || data != 0) @@ -3787,7 +3787,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; #endif default: - if (kvm_pmu_is_valid_msr(vcpu, msr, msr_info->host_initiated)) + if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); return KVM_MSR_RET_INVALID; } @@ -3867,7 +3867,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0; break; case MSR_F15H_PERF_CTL0 ... 
MSR_F15H_PERF_CTR5: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); if (!msr_info->host_initiated) return 1; @@ -3877,7 +3877,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); msr_info->data = 0; break; @@ -4123,7 +4123,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; #endif default: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); return KVM_MSR_RET_INVALID; } -- cgit v1.2.3 From 3f7999b988bde6c50cb7b20d6c742d6512d1f0bd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 11 Jun 2022 00:57:53 +0000 Subject: KVM: VMX: Use vcpu_get_perf_capabilities() to get guest-visible value Use vcpu_get_perf_capabilities() when querying MSR_IA32_PERF_CAPABILITIES from the guest's perspective, e.g. to update the vPMU and to determine which MSRs exist. If userspace ignores MSR_IA32_PERF_CAPABILITIES but clears X86_FEATURE_PDCM, the guest should see '0'. Fixes: 902caeb6841a ("KVM: x86/pmu: Add PEBS_DATA_CFG MSR emulation to support adaptive PEBS") Fixes: c59a1f106f5c ("KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS") Signed-off-by: Sean Christopherson Message-Id: <20220611005755.753273-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index b1aae60cf061..53ccba896e77 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -199,7 +199,7 @@ static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - u64 perf_capabilities = vcpu->arch.perf_capabilities; + u64 perf_capabilities; int ret; switch (msr) { @@ -210,12 +210,13 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) ret = pmu->version > 1; break; case MSR_IA32_PEBS_ENABLE: - ret = perf_capabilities & PERF_CAP_PEBS_FORMAT; + ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT; break; case MSR_IA32_DS_AREA: ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); break; case MSR_PEBS_DATA_CFG: + perf_capabilities = vcpu_get_perf_capabilities(vcpu); ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) && ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3); break; @@ -515,6 +516,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; + u64 perf_capabilities; u64 counter_mask; int i; @@ -599,8 +601,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (lbr_desc->records.nr) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); - if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_FORMAT) { - if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_BASELINE) { + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { + if (perf_capabilities & PERF_CAP_PEBS_BASELINE) {
pmu->pebs_enable_mask = counter_mask; pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { -- cgit v1.2.3 From 157fc497b54fd1dfcdedbca6199adca4bf5ee6ff Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 11 Jun 2022 00:57:54 +0000 Subject: KVM: x86: Ignore benign host accesses to "unsupported" PEBS and BTS MSRs Ignore host userspace reads and writes of '0' to PEBS and BTS MSRs that KVM reports in the MSR-to-save list, but the MSRs are ultimately unsupported. All MSRs in said list must be writable by userspace, e.g. if userspace sends the list back at KVM without filtering out the MSRs it doesn't need. Fixes: 8183a538cd95 ("KVM: x86/pmu: Add IA32_DS_AREA MSR emulation to support guest DS") Fixes: 902caeb6841a ("KVM: x86/pmu: Add PEBS_DATA_CFG MSR emulation to support adaptive PEBS") Fixes: c59a1f106f5c ("KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS") Signed-off-by: Sean Christopherson Message-Id: <20220611005755.753273-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 46301b148f64..e07c5765a551 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3786,6 +3786,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: + if (kvm_pmu_is_valid_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr_info); + /* + * Userspace is allowed to write '0' to MSRs that KVM reports + * as to-be-saved, even if an MSRs isn't fully supported. + */ + return !msr_info->host_initiated || data; default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); @@ -3866,9 +3876,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); + /* + * Userspace is allowed to read MSRs that KVM reports in + * KVM_GET_MSR_INDEX_LIST, even if an MSR isn't fully supported. + */ if (!msr_info->host_initiated) return 1; msr_info->data = 0; -- cgit v1.2.3 From ff81a90f45ce6b818167c590f7625b3b573defc9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 11 Jun 2022 00:57:55 +0000 Subject: KVM: x86: Ignore benign host writes to "unsupported" F15H_PERF_CTL MSRs Ignore host userspace writes of '0' to F15H_PERF_CTL MSRs KVM reports in the MSR-to-save list, but the MSRs are ultimately unsupported. All MSRs in said list must be writable by userspace, e.g. if userspace sends the list back at KVM without filtering out the MSRs it doesn't need. Note, reads of said MSRs already have the desired behavior. 
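Taken together with the previous patch, the write path for these to-be-saved-but-unsupported MSRs condenses to the check below. This is an illustrative sketch of the logic in the hunks, not a literal excerpt:

	/* Sketch: if the vPMU rejects the MSR, accept only a benign
	 * host-initiated write of '0'; returning 0 means success,
	 * non-zero fails the write.
	 */
	if (kvm_pmu_is_valid_msr(vcpu, msr))
		return kvm_pmu_set_msr(vcpu, msr_info);
	return !msr_info->host_initiated || data;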
Signed-off-by: Sean Christopherson Message-Id: <20220611005755.753273-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e07c5765a551..b70fe8560881 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3789,6 +3789,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_PEBS_ENABLE: case MSR_IA32_DS_AREA: case MSR_PEBS_DATA_CFG: + case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); /* -- cgit v1.2.3 From bfbcc81bb82cbbad8bf4e40cea274f42b50674e2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jun 2022 22:45:12 +0000 Subject: KVM: x86: Add a quirk for KVM's "MONITOR/MWAIT are NOPs!" behavior Add a quirk for KVM's behavior of emulating intercepted MONITOR/MWAIT instructions as NOPs regardless of whether or not they are supported in guest CPUID. KVM's current behavior was likely motivated by a certain fruity operating system that expects MONITOR/MWAIT to be supported unconditionally and blindly executes MONITOR/MWAIT without first checking CPUID. And because KVM does NOT advertise MONITOR/MWAIT to userspace, that's effectively the default setup for any VMM that regurgitates KVM_GET_SUPPORTED_CPUID to KVM_SET_CPUID2. Note, this quirk interacts with KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT. The behavior is actually desirable, as userspace VMMs that want to unconditionally hide MONITOR/MWAIT from the guest can leave the MISC_ENABLE quirk enabled. Signed-off-by: Sean Christopherson Message-Id: <20220608224516.3788274-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 13 +++++++++++++ arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/include/uapi/asm/kvm.h | 1 + arch/x86/kvm/x86.c | 30 +++++++++++++++++++----------- 4 files changed, 35 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 84c486ce6279..320cb04f7bd9 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7522,6 +7522,19 @@ The valid bits in cap.args[0] are: hypercall instructions. Executing the incorrect hypercall instruction will generate a #UD within the guest. + +KVM_X86_QUIRK_MWAIT_NEVER_FAULTS By default, KVM emulates MONITOR/MWAIT (if + they are intercepted) as NOPs regardless of + whether or not MONITOR/MWAIT are supported + according to guest CPUID. When this quirk + is disabled and KVM_X86_DISABLE_EXITS_MWAIT + is not set (MONITOR/MWAIT are intercepted), + KVM will inject a #UD on MONITOR/MWAIT if + they're unsupported per guest CPUID. Note, + KVM will modify MONITOR/MWAIT support in + guest CPUID on writes to MISC_ENABLE if + KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT is + disabled.
=================================== ============================================ 7.32 KVM_CAP_MAX_VCPU_ID diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1038ccb7056a..e37727a74d0a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2076,6 +2076,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \ KVM_X86_QUIRK_OUT_7E_INC_RIP | \ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ - KVM_X86_QUIRK_FIX_HYPERCALL_INSN) + KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ + KVM_X86_QUIRK_MWAIT_NEVER_FAULTS) #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 50a4e787d5e6..ee3896416c68 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -439,6 +439,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) +#define KVM_X86_QUIRK_MWAIT_NEVER_FAULTS (1 << 6) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b70fe8560881..9b9f9f65c548 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2036,13 +2036,6 @@ int kvm_emulate_invd(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_invd); -int kvm_emulate_mwait(struct kvm_vcpu *vcpu) -{ - pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n"); - return kvm_emulate_as_nop(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_emulate_mwait); - int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -2050,11 +2043,26 @@ int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); -int kvm_emulate_monitor(struct kvm_vcpu *vcpu) + +static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) { - pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n"); + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_FAULTS) && + !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) + return kvm_handle_invalid_op(vcpu); + + pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn); return kvm_emulate_as_nop(vcpu); } +int kvm_emulate_mwait(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_monitor_mwait(vcpu, "MWAIT"); +} +EXPORT_SYMBOL_GPL(kvm_emulate_mwait); + +int kvm_emulate_monitor(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_monitor_mwait(vcpu, "MONITOR"); +} EXPORT_SYMBOL_GPL(kvm_emulate_monitor); static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) @@ -3884,8 +3892,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); /* - * Userspace is allowed to read MSRs that KVM reports in - * KVM_GET_MSR_INDEX_LIST, even if an MSR isn't fully supported. + * Userspace is allowed to read MSRs that KVM reports as + * to-be-saved, even if an MSR isn't fully supported. */ if (!msr_info->host_initiated) return 1; -- cgit v1.2.3 From 1c4dc57328bf218e999951824dce75c6125c4f3c Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 13 Jun 2022 21:25:20 +0000 Subject: KVM: x86: Fix errant brace in KVM capability handling The braces around the KVM_CAP_XSAVE2 block also surround the KVM_CAP_PMU_CAPABILITY block, likely the result of a merge issue. Simply move the curly brace back to where it belongs. 
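In outline, the fix restores the structure below (a sketch of the corrected switch arms; the full hunk follows):

	case KVM_CAP_XSAVE2: {
		/* ... XSAVE2 size computation elided ... */
		if (r < sizeof(struct kvm_xsave))
			r = sizeof(struct kvm_xsave);
		break;
	}	/* The brace once again closes only the XSAVE2 block. */
	case KVM_CAP_PMU_CAPABILITY:
		r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0;
		break;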
Fixes: ba7bb663f5547 ("KVM: x86: Provide per VM capability for disabling PMU virtualization") Reviewed-by: David Matlack Reviewed-by: Peter Xu Signed-off-by: Ben Gardon Message-Id: <20220613212523.3436117-8-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9b9f9f65c548..c1b3b2ea8ee0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4420,10 +4420,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) if (r < sizeof(struct kvm_xsave)) r = sizeof(struct kvm_xsave); break; + } case KVM_CAP_PMU_CAPABILITY: r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0; break; - } case KVM_CAP_DISABLE_QUIRKS2: r = KVM_X86_VALID_QUIRKS; break; -- cgit v1.2.3 From 084cc29f8bbb034cf30a7ee07a816c115e0c28df Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 13 Jun 2022 21:25:21 +0000 Subject: KVM: x86/MMU: Allow NX huge pages to be disabled on a per-vm basis In some cases, the NX hugepage mitigation for iTLB multihit is not needed for all guests on a host. Allow disabling the mitigation on a per-VM basis to avoid the performance hit of NX hugepages on trusted workloads. In order to disable NX hugepages on a VM, ensure that the userspace actor has permission to reboot the system. Since disabling NX hugepages would allow a guest to crash the system, it is similar to reboot permissions. Ideally, KVM would require userspace to prove it has access to KVM's nx_huge_pages module param, e.g. so that userspace can opt out without needing full reboot permissions. But getting access to the module param file info is difficult because it is buried in layers of sysfs and module glue. Requiring CAP_SYS_BOOT is sufficient for all known use cases. Suggested-by: Jim Mattson Reviewed-by: David Matlack Reviewed-by: Peter Xu Signed-off-by: Ben Gardon Message-Id: <20220613212523.3436117-9-bgardon@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 16 ++++++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/mmu/mmu_internal.h | 7 ++++--- arch/x86/kvm/mmu/spte.c | 7 ++++--- arch/x86/kvm/mmu/spte.h | 3 ++- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- arch/x86/kvm/x86.c | 30 ++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 8 files changed, 60 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 320cb04f7bd9..bafaeedd455c 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8206,6 +8206,22 @@ PV guests. The `KVM_PV_DUMP` command is available for the dump related UV data. Also the vcpu ioctl `KVM_S390_PV_CPU_COMMAND` is available and supports the `KVM_PV_DUMP_CPU` subcommand. +8.38 KVM_CAP_VM_DISABLE_NX_HUGE_PAGES +--------------------------- + +:Capability KVM_CAP_VM_DISABLE_NX_HUGE_PAGES +:Architectures: x86 +:Type: vm +:Parameters: arg[0] must be 0. +:Returns 0 on success, -EPERM if the userspace process does not + have CAP_SYS_BOOT, -EINVAL if args[0] is not 0 or any vCPUs have been + created. + +This capability disables the NX huge pages mitigation for iTLB MULTIHIT. + +The capability has no effect if the nx_huge_pages module parameter is not set. + +This capability may only be set before any vCPUs are created. 9. 
Known KVM API problems ========================= diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e37727a74d0a..7e4c31b57a75 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1336,6 +1336,8 @@ struct kvm_arch { * the global KVM_MAX_VCPU_IDS may lead to significant memory waste. */ u32 max_vcpu_ids; + + bool disable_nx_huge_pages; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 5e1e3c8f8aaa..bb9d12ac0db3 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -155,9 +155,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); extern int nx_huge_pages; -static inline bool is_nx_huge_page_enabled(void) +static inline bool is_nx_huge_page_enabled(struct kvm *kvm) { - return READ_ONCE(nx_huge_pages); + return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages; } struct kvm_page_fault { @@ -256,7 +256,8 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .user = err & PFERR_USER_MASK, .prefetch = prefetch, .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), - .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(), + .nx_huge_page_workaround_enabled = + is_nx_huge_page_enabled(vcpu->kvm), .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 242e4828d7df..db294c1beea2 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -147,7 +147,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= spte_shadow_accessed_mask(spte); if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && - is_nx_huge_page_enabled()) { + is_nx_huge_page_enabled(vcpu->kvm)) { pte_access &= ~ACC_EXEC_MASK; } @@ -246,7 +246,8 @@ static u64 make_spte_executable(u64 spte) * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, int huge_level, + int index) { u64 child_spte; int child_level; @@ -274,7 +275,7 @@ u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) * When splitting to a 4K page, mark the page executable as the * NX hugepage mitigation no longer applies. 
*/ - if (is_nx_huge_page_enabled()) + if (is_nx_huge_page_enabled(kvm)) child_spte = make_spte_executable(child_spte); } diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 121c5eaaec77..256f90587e8d 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -421,7 +421,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, bool host_writable, u64 *new_spte); -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index); +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, int huge_level, + int index); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 1ea40809ef1f..522e2532343b 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1478,7 +1478,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * not been linked in yet and thus is not reachable from any other CPU. */ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) - sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i); + sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, level, i); /* * Replace the huge spte with a pointer to the populated lower level diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1b3b2ea8ee0..7ce0c6fe166d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4324,6 +4324,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SYS_ATTRIBUTES: case KVM_CAP_VAPIC: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -6184,6 +6185,35 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: + r = -EINVAL; + + /* + * Since the risk of disabling NX hugepages is a guest crashing + * the system, ensure the userspace process has permission to + * reboot the system. + * + * Note that unlike the reboot() syscall, the process must have + * this capability in the root namespace because exposing + * /dev/kvm into a container does not limit the scope of the + * iTLB multihit bug to that container. In other words, + * this must use capable(), not ns_capable(). + */ + if (!capable(CAP_SYS_BOOT)) { + r = -EPERM; + break; + } + + if (cap->args[0]) + break; + + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.disable_nx_huge_pages = true; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7569b4ec199c..a36e78710382 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1166,6 +1166,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_PROTECTED_DUMP 217 #define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218 #define KVM_CAP_X86_NOTIFY_VMEXIT 219 +#define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From bb924ca69f71b4f54c8f07be28e2b76f5ad1d2ac Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:26:48 -0400 Subject: KVM: x86/mmu: Optimize MMU page cache lookup for all direct SPs Commit fb58a9c345f6 ("KVM: x86/mmu: Optimize MMU page cache lookup for fully direct MMUs") skipped the unsync checks and write flood clearing for full direct MMUs. We can extend this further to skip the checks for all direct shadow pages. 
Direct shadow pages in indirect MMUs (i.e. shadow paging) are used when shadowing a guest huge page with smaller pages. Such direct shadow pages, like their counterparts in fully direct MMUs, are never marked unsynced or have a non-zero write-flooding count. Checking sp->role.direct also generates better code than checking direct_map because, due to register pressure, direct_map has to get shoved onto the stack and then pulled back off. No functional change intended. Reviewed-by: Lai Jiangshan Reviewed-by: Sean Christopherson Reviewed-by: Peter Xu Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-2-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 27b2a5603496..c0afb4f1c8ae 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2000,7 +2000,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, int direct, unsigned int access) { - bool direct_mmu = vcpu->arch.mmu->root_role.direct; union kvm_mmu_page_role role; struct hlist_head *sp_list; unsigned quadrant; @@ -2060,7 +2059,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, continue; } - if (direct_mmu) + /* unsync and write-flooding only apply to indirect SPs. */ + if (sp->role.direct) goto trace_get_page; if (sp->unsync) { -- cgit v1.2.3 From 27a59d57f073f21f029df1517c2c0a1abea5b0ce Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:26:49 -0400 Subject: KVM: x86/mmu: Use a bool for direct The parameter "direct" can either be true or false, and all of the callers pass in a bool variable or true/false literal, so just use the type bool. No functional change intended. Reviewed-by: Lai Jiangshan Reviewed-by: Sean Christopherson Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-3-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c0afb4f1c8ae..844b58ddb3bb 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1664,7 +1664,7 @@ static void drop_parent_pte(struct kvm_mmu_page *sp, mmu_spte_clear_no_track(parent_pte); } -static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct) +static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, bool direct) { struct kvm_mmu_page *sp; @@ -1997,7 +1997,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, unsigned level, - int direct, + bool direct, unsigned int access) { union kvm_mmu_page_role role; -- cgit v1.2.3 From 86938ab6925b8fe174ca6abf397e6ea9d3c054a4 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:26:50 -0400 Subject: KVM: x86/mmu: Stop passing "direct" to mmu_alloc_root() The "direct" argument is vcpu->arch.mmu->root_role.direct, because unlike non-root page tables, it's impossible to have a direct root in an indirect MMU. So just use that. 
Suggested-by: Lai Jiangshan Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-4-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 844b58ddb3bb..2e30398fe59f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3369,8 +3369,9 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) } static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, - u8 level, bool direct) + u8 level) { + bool direct = vcpu->arch.mmu->root_role.direct; struct kvm_mmu_page *sp; sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); @@ -3396,7 +3397,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); mmu->root.hpa = root; } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { - root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level); mmu->root.hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { if (WARN_ON_ONCE(!mmu->pae_root)) { @@ -3408,7 +3409,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), - i << 30, PT32_ROOT_LEVEL, true); + i << 30, PT32_ROOT_LEVEL); mmu->pae_root[i] = root | PT_PRESENT_MASK | shadow_me_value; } @@ -3532,7 +3533,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) */ if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, root_gfn, 0, - mmu->root_role.level, false); + mmu->root_role.level); mmu->root.hpa = root; goto set_root_pgd; } @@ -3578,7 +3579,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) } root = mmu_alloc_root(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, false); + PT32_ROOT_LEVEL); mmu->pae_root[i] = root | pm_mask; } -- cgit v1.2.3 From 2e65e842c57d72e9a573ba42bc2055b7f626ea1f Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:26:51 -0400 Subject: KVM: x86/mmu: Derive shadow MMU page role from parent Instead of computing the shadow page role from scratch for every new page, derive most of the information from the parent shadow page. This eliminates the dependency on the vCPU root role to allocate shadow page tables, and reduces the number of parameters to kvm_mmu_get_page(). Preemptively split out the role calculation to a separate function for use in a following commit. Note that when calculating the MMU root role, we can take @role.passthrough, @role.direct, and @role.access directly from @vcpu->arch.mmu->root_role. Only @role.level and @role.quadrant still must be overridden for PAE page directories, when shadowing 32-bit guest page tables with PAE page tables. No functional change intended. 
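The heart of the patch is the new derivation, condensed below (illustrative sketch; the full kvm_mmu_child_role() with its quadrant commentary is in the hunk that follows):

	/* Sketch: a child SP inherits its parent's role, overriding only
	 * the fields that differ per child.
	 */
	role = parent_sp->role;
	role.level--;
	role.access = access;
	role.direct = direct;
	role.passthrough = 0;
	if (role.has_4_byte_gpte)	/* only 4K page tables reach here */
		role.quadrant = (sptep - parent_sp->spt) % 2;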
Reviewed-by: Peter Xu Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-5-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 114 ++++++++++++++++++++++++----------------- arch/x86/kvm/mmu/paging_tmpl.h | 9 ++-- 2 files changed, 71 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2e30398fe59f..fd1b479bf7fc 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1993,49 +1993,15 @@ static void clear_sp_write_flooding_count(u64 *spte) __clear_sp_write_flooding_count(sptep_to_sp(spte)); } -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - gva_t gaddr, - unsigned level, - bool direct, - unsigned int access) +static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, + union kvm_mmu_page_role role) { - union kvm_mmu_page_role role; struct hlist_head *sp_list; - unsigned quadrant; struct kvm_mmu_page *sp; int ret; int collisions = 0; LIST_HEAD(invalid_list); - role = vcpu->arch.mmu->root_role; - role.level = level; - role.direct = direct; - role.access = access; - if (role.has_4_byte_gpte) { - /* - * If the guest has 4-byte PTEs then that means it's using 32-bit, - * 2-level, non-PAE paging. KVM shadows such guests with PAE paging - * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must - * shadow each guest page table with multiple shadow page tables, which - * requires extra bookkeeping in the role. - * - * Specifically, to shadow the guest's page directory (which covers a - * 4GiB address space), KVM uses 4 PAE page directories, each mapping - * 1GiB of the address space. @role.quadrant encodes which quarter of - * the address space each maps. - * - * To shadow the guest's page tables (which each map a 4MiB region), KVM - * uses 2 PAE page tables, each mapping a 2MiB region. For these, - * @role.quadrant encodes which half of the region they map. - */ - quadrant = gaddr >> (PAGE_SHIFT + (SPTE_LEVEL_BITS * level)); - quadrant &= (1 << level) - 1; - role.quadrant = quadrant; - } - if (level <= vcpu->arch.mmu->cpu_role.base.level) - role.passthrough = 0; - sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; for_each_valid_sp(vcpu->kvm, sp, sp_list) { if (sp->gfn != gfn) { @@ -2053,7 +2019,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, * Unsync pages must not be left as is, because the new * upper-level page will be write-protected. 
*/ - if (level > PG_LEVEL_4K && sp->unsync) + if (role.level > PG_LEVEL_4K && sp->unsync) kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); continue; @@ -2094,14 +2060,14 @@ trace_get_page: ++vcpu->kvm->stat.mmu_cache_miss; - sp = kvm_mmu_alloc_page(vcpu, direct); + sp = kvm_mmu_alloc_page(vcpu, role.direct); sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, sp_list); if (sp_has_gptes(sp)) { account_shadowed(vcpu->kvm, sp); - if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn)) + if (role.level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); } trace_kvm_mmu_get_page(sp, true); @@ -2113,6 +2079,55 @@ out: return sp; } +static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, unsigned int access) +{ + struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep); + union kvm_mmu_page_role role; + + role = parent_sp->role; + role.level--; + role.access = access; + role.direct = direct; + role.passthrough = 0; + + /* + * If the guest has 4-byte PTEs then that means it's using 32-bit, + * 2-level, non-PAE paging. KVM shadows such guests with PAE paging + * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must + * shadow each guest page table with multiple shadow page tables, which + * requires extra bookkeeping in the role. + * + * Specifically, to shadow the guest's page directory (which covers a + * 4GiB address space), KVM uses 4 PAE page directories, each mapping + * 1GiB of the address space. @role.quadrant encodes which quarter of + * the address space each maps. + * + * To shadow the guest's page tables (which each map a 4MiB region), KVM + * uses 2 PAE page tables, each mapping a 2MiB region. For these, + * @role.quadrant encodes which half of the region they map. + * + * Note, the 4 PAE page directories are pre-allocated and the quadrant + * assigned in mmu_alloc_root(). So only page tables need to be handled + * here. 
+ */ + if (role.has_4_byte_gpte) { + WARN_ON_ONCE(role.level != PG_LEVEL_4K); + role.quadrant = (sptep - parent_sp->spt) % 2; + } + + return role; +} + +static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, + u64 *sptep, gfn_t gfn, + bool direct, unsigned int access) +{ + union kvm_mmu_page_role role; + + role = kvm_mmu_child_role(sptep, direct, access); + return kvm_mmu_get_page(vcpu, gfn, role); +} + static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu, hpa_t root, u64 addr) @@ -2964,8 +2979,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (is_shadow_present_pte(*it.sptep)) continue; - sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, - it.level - 1, true, ACC_ALL); + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); if (fault->is_tdp && fault->huge_page_disallowed && @@ -3368,13 +3382,18 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) return ret; } -static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, +static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant, u8 level) { - bool direct = vcpu->arch.mmu->root_role.direct; + union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; struct kvm_mmu_page *sp; - sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); + role.level = level; + + if (role.has_4_byte_gpte) + role.quadrant = quadrant; + + sp = kvm_mmu_get_page(vcpu, gfn, role); ++sp->root_count; return __pa(sp->spt); @@ -3408,8 +3427,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); - root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), - i << 30, PT32_ROOT_LEVEL); + root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), i, + PT32_ROOT_LEVEL); mmu->pae_root[i] = root | PT_PRESENT_MASK | shadow_me_value; } @@ -3578,8 +3597,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) root_gfn = pdptrs[i] >> PAGE_SHIFT; } - root = mmu_alloc_root(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL); + root = mmu_alloc_root(vcpu, root_gfn, i, PT32_ROOT_LEVEL); mmu->pae_root[i] = root | pm_mask; } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index e4655056e651..6ecdd7a41a82 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -654,8 +654,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (!is_shadow_present_pte(*it.sptep)) { table_gfn = gw->table_gfn[it.level - 2]; access = gw->pt_access[it.level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr, - it.level-1, false, access); + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, + false, access); + /* * We must synchronize the pagetable before linking it * because the guest doesn't need to flush tlb when @@ -711,8 +712,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, drop_large_spte(vcpu, it.sptep); if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr, - it.level - 1, true, direct_access); + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, + true, direct_access); link_shadow_page(vcpu, it.sptep, sp); if (fault->huge_page_disallowed && fault->req_level >= it.level) -- cgit v1.2.3 From 7f49777550e55a7d6832cbb0873f48f91c175b9c Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:26:52 -0400 Subject: KVM: x86/mmu: Always pass 0 for @quadrant when 
gptes are 8 bytes The quadrant is only used when gptes are 4 bytes, but mmu_alloc_{direct,shadow}_roots() pass in a non-zero quadrant for PAE page directories regardless. Make this less confusing by only passing in a non-zero quadrant when it is actually necessary. Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-6-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index fd1b479bf7fc..f4e7978a6c6a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3389,9 +3389,10 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant, struct kvm_mmu_page *sp; role.level = level; + role.quadrant = quadrant; - if (role.has_4_byte_gpte) - role.quadrant = quadrant; + WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte); + WARN_ON_ONCE(role.direct && role.has_4_byte_gpte); sp = kvm_mmu_get_page(vcpu, gfn, role); ++sp->root_count; @@ -3427,7 +3428,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); - root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), i, + root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0, PT32_ROOT_LEVEL); mmu->pae_root[i] = root | PT_PRESENT_MASK | shadow_me_value; @@ -3512,9 +3513,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) struct kvm_mmu *mmu = vcpu->arch.mmu; u64 pdptrs[4], pm_mask; gfn_t root_gfn, root_pgd; + int quadrant, i, r; hpa_t root; - unsigned i; - int r; root_pgd = mmu->get_guest_pgd(vcpu); root_gfn = root_pgd >> PAGE_SHIFT; @@ -3597,7 +3597,15 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) root_gfn = pdptrs[i] >> PAGE_SHIFT; } - root = mmu_alloc_root(vcpu, root_gfn, i, PT32_ROOT_LEVEL); + /* + * If shadowing 32-bit non-PAE page tables, each PAE page + * directory maps one quarter of the guest's non-PAE page + * directory. Othwerise each PAE page direct shadows one guest + * PAE page directory so that quadrant should be 0. + */ + quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0; + + root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL); mmu->pae_root[i] = root | pm_mask; } -- cgit v1.2.3 From 94c8136448c80496cbfe0922bcb379bcf62cb8ac Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:26:53 -0400 Subject: KVM: x86/mmu: Decompose kvm_mmu_get_page() into separate functions Decompose kvm_mmu_get_page() into separate helper functions to increase readability and prepare for allocating shadow pages without a vcpu pointer. Specifically, pull the guts of kvm_mmu_get_page() into 2 helper functions: kvm_mmu_find_shadow_page() - Walks the page hash checking for any existing mmu pages that match the given gfn and role. kvm_mmu_alloc_shadow_page() - Allocates and initializes an entirely new kvm_mmu_page. This currently requires a vcpu pointer for allocation and looking up the memslot but that will be removed in a future commit. No functional change intended.
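After the split, kvm_mmu_get_page() reduces to a find-or-allocate wrapper (sketch of the flow; see the hunk below):

	sp = kvm_mmu_find_shadow_page(vcpu, gfn, sp_list, role);
	if (!sp) {
		created = true;
		sp = kvm_mmu_alloc_shadow_page(vcpu, gfn, sp_list, role);
	}
	trace_kvm_mmu_get_page(sp, created);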
Reviewed-by: Sean Christopherson Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-7-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 52 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f4e7978a6c6a..a59fe860da29 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1993,16 +1993,16 @@ static void clear_sp_write_flooding_count(u64 *spte) __clear_sp_write_flooding_count(sptep_to_sp(spte)); } -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, - union kvm_mmu_page_role role) +static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm_vcpu *vcpu, + gfn_t gfn, + struct hlist_head *sp_list, + union kvm_mmu_page_role role) { - struct hlist_head *sp_list; struct kvm_mmu_page *sp; int ret; int collisions = 0; LIST_HEAD(invalid_list); - sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; for_each_valid_sp(vcpu->kvm, sp, sp_list) { if (sp->gfn != gfn) { collisions++; @@ -2027,7 +2027,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, /* unsync and write-flooding only apply to indirect SPs. */ if (sp->role.direct) - goto trace_get_page; + goto out; if (sp->unsync) { /* @@ -2053,14 +2053,26 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, __clear_sp_write_flooding_count(sp); -trace_get_page: - trace_kvm_mmu_get_page(sp, false); goto out; } + sp = NULL; ++vcpu->kvm->stat.mmu_cache_miss; - sp = kvm_mmu_alloc_page(vcpu, role.direct); +out: + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + + if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) + vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; + return sp; +} + +static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm_vcpu *vcpu, + gfn_t gfn, + struct hlist_head *sp_list, + union kvm_mmu_page_role role) +{ + struct kvm_mmu_page *sp = kvm_mmu_alloc_page(vcpu, role.direct); sp->gfn = gfn; sp->role = role; @@ -2070,12 +2082,26 @@ trace_get_page: if (role.level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); } - trace_kvm_mmu_get_page(sp, true); -out: - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) - vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; + return sp; +} + +static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, + union kvm_mmu_page_role role) +{ + struct hlist_head *sp_list; + struct kvm_mmu_page *sp; + bool created = false; + + sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; + + sp = kvm_mmu_find_shadow_page(vcpu, gfn, sp_list, role); + if (!sp) { + created = true; + sp = kvm_mmu_alloc_shadow_page(vcpu, gfn, sp_list, role); + } + + trace_kvm_mmu_get_page(sp, created); return sp; } -- cgit v1.2.3 From c306aec81ae1f40af42bf531065b66308ff72251 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:26:54 -0400 Subject: KVM: x86/mmu: Consolidate shadow page allocation and initialization Consolidate kvm_mmu_alloc_page() and kvm_mmu_alloc_shadow_page() under the latter so that all shadow page allocation and initialization happens in one place. No functional change intended. 
Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-8-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a59fe860da29..8b84cdd8c6cd 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1664,27 +1664,6 @@ static void drop_parent_pte(struct kvm_mmu_page *sp, mmu_spte_clear_no_track(parent_pte); } -static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, bool direct) -{ - struct kvm_mmu_page *sp; - - sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); - sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); - if (!direct) - sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); - set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - - /* - * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() - * depends on valid pages being added to the head of the list. See - * comments in kvm_zap_obsolete_pages(). - */ - sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; - list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - kvm_mod_used_mmu_pages(vcpu->kvm, +1); - return sp; -} - static void mark_unsync(u64 *spte); static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { @@ -2072,7 +2051,23 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm_vcpu *vcpu, struct hlist_head *sp_list, union kvm_mmu_page_role role) { - struct kvm_mmu_page *sp = kvm_mmu_alloc_page(vcpu, role.direct); + struct kvm_mmu_page *sp; + + sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); + if (!role.direct) + sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); + + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + + /* + * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() + * depends on valid pages being added to the head of the list. See + * comments in kvm_zap_obsolete_pages(). + */ + sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; + list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); + kvm_mod_used_mmu_pages(vcpu->kvm, +1); sp->gfn = gfn; sp->role = role; -- cgit v1.2.3 From 876546436db9775caee4cadf78edd2b5bf72ac84 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:26:55 -0400 Subject: KVM: x86/mmu: Rename shadow MMU functions that deal with shadow pages Rename 2 functions: kvm_mmu_get_page() -> kvm_mmu_get_shadow_page() kvm_mmu_free_page() -> kvm_mmu_free_shadow_page() This change makes it clear that these functions deal with shadow pages rather than struct pages. It also aligns these functions with the naming scheme for kvm_mmu_find_shadow_page() and kvm_mmu_alloc_shadow_page(). Prefer "shadow_page" over the shorter "sp" since these are core functions and the line lengths aren't terrible. No functional change intended. 
Reviewed-by: Sean Christopherson Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-9-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8b84cdd8c6cd..bd45364bf465 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1626,7 +1626,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) percpu_counter_add(&kvm_total_used_mmu_pages, nr); } -static void kvm_mmu_free_page(struct kvm_mmu_page *sp) +static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) { MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); @@ -2081,8 +2081,9 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm_vcpu *vcpu, return sp; } -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, - union kvm_mmu_page_role role) +static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, + gfn_t gfn, + union kvm_mmu_page_role role) { struct hlist_head *sp_list; struct kvm_mmu_page *sp; @@ -2146,7 +2147,7 @@ static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, union kvm_mmu_page_role role; role = kvm_mmu_child_role(sptep, direct, access); - return kvm_mmu_get_page(vcpu, gfn, role); + return kvm_mmu_get_shadow_page(vcpu, gfn, role); } static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, @@ -2422,7 +2423,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, list_for_each_entry_safe(sp, nsp, invalid_list, link) { WARN_ON(!sp->role.invalid || sp->root_count); - kvm_mmu_free_page(sp); + kvm_mmu_free_shadow_page(sp); } } @@ -3415,7 +3416,7 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant, WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte); WARN_ON_ONCE(role.direct && role.has_4_byte_gpte); - sp = kvm_mmu_get_page(vcpu, gfn, role); + sp = kvm_mmu_get_shadow_page(vcpu, gfn, role); ++sp->root_count; return __pa(sp->spt); -- cgit v1.2.3 From be911771330a2ae0938d452fd89a8df085533134 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:26:56 -0400 Subject: KVM: x86/mmu: Move guest PT write-protection to account_shadowed() Move the code that write-protects newly-shadowed guest page tables into account_shadowed(). This avoids an extra gfn-to-memslot lookup and is a more logical place for this code to live. But most importantly, this reduces kvm_mmu_alloc_shadow_page()'s reliance on having a struct kvm_vcpu pointer, which will be necessary when creating new shadow pages during VM ioctls for eager page splitting. Note, it is safe to drop the role.level == PG_LEVEL_4K check since account_shadowed() returns early if role.level > PG_LEVEL_4K. No functional change intended.
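Condensed, account_shadowed() now ends with the write-protect-and-flush step it previously left to its caller (sketch; the hunk follows):

	/* account_shadowed() returns early above PG_LEVEL_4K, so the
	 * explicit level check at the old call site is unnecessary.
	 */
	if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);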
Reviewed-by: Sean Christopherson Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-10-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index bd45364bf465..2602c3642f23 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -766,6 +766,9 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) KVM_PAGE_TRACK_WRITE); kvm_mmu_gfn_disallow_lpage(slot, gfn); + + if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) + kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); } void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -2072,11 +2075,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm_vcpu *vcpu, sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, sp_list); - if (sp_has_gptes(sp)) { + if (sp_has_gptes(sp)) account_shadowed(vcpu->kvm, sp); - if (role.level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn)) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); - } return sp; } -- cgit v1.2.3 From 2f8b1b539be37f6bbc4827dfe3320935976fefa4 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:26:57 -0400 Subject: KVM: x86/mmu: Pass memory caches to allocate SPs separately Refactor kvm_mmu_alloc_shadow_page() to receive the caches from which it will allocate the various pieces of memory for shadow pages as a parameter, rather than deriving them from the vcpu pointer. This will be useful in a future commit where shadow pages are allocated during VM ioctls for eager page splitting, and thus will use a different set of caches. Preemptively pull the caches out all the way to kvm_mmu_get_shadow_page() since eager page splitting will not be calling kvm_mmu_alloc_shadow_page() directly. No functional change intended. Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-11-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2602c3642f23..fab417e7bf6c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2049,17 +2049,25 @@ out: return sp; } +/* Caches used when allocating a new shadow page. 
*/ +struct shadow_page_caches { + struct kvm_mmu_memory_cache *page_header_cache; + struct kvm_mmu_memory_cache *shadow_page_cache; + struct kvm_mmu_memory_cache *gfn_array_cache; +}; + static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm_vcpu *vcpu, + struct shadow_page_caches *caches, gfn_t gfn, struct hlist_head *sp_list, union kvm_mmu_page_role role) { struct kvm_mmu_page *sp; - sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); - sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); + sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache); if (!role.direct) - sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); + sp->gfns = kvm_mmu_memory_cache_alloc(caches->gfn_array_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); @@ -2081,9 +2089,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm_vcpu *vcpu, return sp; } -static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - union kvm_mmu_page_role role) +static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, + struct shadow_page_caches *caches, + gfn_t gfn, + union kvm_mmu_page_role role) { struct hlist_head *sp_list; struct kvm_mmu_page *sp; @@ -2094,13 +2103,26 @@ static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, sp = kvm_mmu_find_shadow_page(vcpu, gfn, sp_list, role); if (!sp) { created = true; - sp = kvm_mmu_alloc_shadow_page(vcpu, gfn, sp_list, role); + sp = kvm_mmu_alloc_shadow_page(vcpu, caches, gfn, sp_list, role); } trace_kvm_mmu_get_page(sp, created); return sp; } +static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, + gfn_t gfn, + union kvm_mmu_page_role role) +{ + struct shadow_page_caches caches = { + .page_header_cache = &vcpu->arch.mmu_page_header_cache, + .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache, + .gfn_array_cache = &vcpu->arch.mmu_gfn_array_cache, + }; + + return __kvm_mmu_get_shadow_page(vcpu, &caches, gfn, role); +} + static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, unsigned int access) { struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep); -- cgit v1.2.3 From 336081fb3f26a3e819838317c011aee41d88e676 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:26:58 -0400 Subject: KVM: x86/mmu: Replace vcpu with kvm in kvm_mmu_alloc_shadow_page() The vcpu pointer in kvm_mmu_alloc_shadow_page() is only used to get the kvm pointer. So drop the vcpu pointer and just pass in the kvm pointer. No functional change intended. Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-12-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index fab417e7bf6c..c5a88e8d1b53 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2056,7 +2056,7 @@ struct shadow_page_caches { struct kvm_mmu_memory_cache *gfn_array_cache; }; -static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm_vcpu *vcpu, +static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, struct shadow_page_caches *caches, gfn_t gfn, struct hlist_head *sp_list, @@ -2076,15 +2076,15 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm_vcpu *vcpu, * depends on valid pages being added to the head of the list. 
See * comments in kvm_zap_obsolete_pages(). */ - sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; - list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - kvm_mod_used_mmu_pages(vcpu->kvm, +1); + sp->mmu_valid_gen = kvm->arch.mmu_valid_gen; + list_add(&sp->link, &kvm->arch.active_mmu_pages); + kvm_mod_used_mmu_pages(kvm, +1); sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, sp_list); if (sp_has_gptes(sp)) - account_shadowed(vcpu->kvm, sp); + account_shadowed(kvm, sp); return sp; } @@ -2103,7 +2103,7 @@ static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, sp = kvm_mmu_find_shadow_page(vcpu, gfn, sp_list, role); if (!sp) { created = true; - sp = kvm_mmu_alloc_shadow_page(vcpu, caches, gfn, sp_list, role); + sp = kvm_mmu_alloc_shadow_page(vcpu->kvm, caches, gfn, sp_list, role); } trace_kvm_mmu_get_page(sp, created); -- cgit v1.2.3 From 3cc736b35799ab330225d89976fac36794e4bec0 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:26:59 -0400 Subject: KVM: x86/mmu: Pass kvm pointer separately from vcpu to kvm_mmu_find_shadow_page() Get the kvm pointer from the caller, rather than deriving it from vcpu->kvm, and plumb the kvm pointer all the way from kvm_mmu_get_shadow_page(). With this change in place, the vcpu pointer is only needed to sync indirect shadow pages. In other words, __kvm_mmu_get_shadow_page() can now be used to get *direct* shadow pages without a vcpu pointer. This enables eager page splitting, which needs to allocate direct shadow pages during VM ioctls. No functional change intended. Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-13-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c5a88e8d1b53..88b3f3c2c8b1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1975,7 +1975,8 @@ static void clear_sp_write_flooding_count(u64 *spte) __clear_sp_write_flooding_count(sptep_to_sp(spte)); } -static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm_vcpu *vcpu, +static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, + struct kvm_vcpu *vcpu, gfn_t gfn, struct hlist_head *sp_list, union kvm_mmu_page_role role) @@ -1985,7 +1986,7 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm_vcpu *vcpu, int collisions = 0; LIST_HEAD(invalid_list); - for_each_valid_sp(vcpu->kvm, sp, sp_list) { + for_each_valid_sp(kvm, sp, sp_list) { if (sp->gfn != gfn) { collisions++; continue; @@ -2002,7 +2003,7 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm_vcpu *vcpu, * upper-level page will be write-protected. 
*/ if (role.level > PG_LEVEL_4K && sp->unsync) - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); continue; } @@ -2030,7 +2031,7 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm_vcpu *vcpu, WARN_ON(!list_empty(&invalid_list)); if (ret > 0) - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_flush_remote_tlbs(kvm); } __clear_sp_write_flooding_count(sp); @@ -2039,13 +2040,13 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm_vcpu *vcpu, } sp = NULL; - ++vcpu->kvm->stat.mmu_cache_miss; + ++kvm->stat.mmu_cache_miss; out: - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + kvm_mmu_commit_zap_page(kvm, &invalid_list); - if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) - vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; + if (collisions > kvm->stat.max_mmu_page_hash_collisions) + kvm->stat.max_mmu_page_hash_collisions = collisions; return sp; } @@ -2089,7 +2090,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, return sp; } -static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, +static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm, + struct kvm_vcpu *vcpu, struct shadow_page_caches *caches, gfn_t gfn, union kvm_mmu_page_role role) @@ -2098,12 +2100,12 @@ static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp; bool created = false; - sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; + sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; - sp = kvm_mmu_find_shadow_page(vcpu, gfn, sp_list, role); + sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role); if (!sp) { created = true; - sp = kvm_mmu_alloc_shadow_page(vcpu->kvm, caches, gfn, sp_list, role); + sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role); } trace_kvm_mmu_get_page(sp, created); @@ -2120,7 +2122,7 @@ static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, .gfn_array_cache = &vcpu->arch.mmu_gfn_array_cache, }; - return __kvm_mmu_get_shadow_page(vcpu, &caches, gfn, role); + return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role); } static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, unsigned int access) -- cgit v1.2.3 From cbd858b17e379f727c6bf1eaedde7b1a44e75b40 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:27:00 -0400 Subject: KVM: x86/mmu: Allow NULL @vcpu in kvm_mmu_find_shadow_page() Allow @vcpu to be NULL in kvm_mmu_find_shadow_page() (and its only caller __kvm_mmu_get_shadow_page()). @vcpu is only required to sync indirect shadow pages, so it's safe to pass in NULL when looking up direct shadow pages. This will be used for doing eager page splitting, which allocates direct shadow pages from the context of a VM ioctl without access to a vCPU pointer. 
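To make the new contract concrete, the rule reduces to "syncing needs a vCPU, and direct pages never need syncing". Below is a minimal, self-contained sketch of that contract; every name in it (struct shadow_page, find_page(), sync_page()) is an illustrative stand-in, not the actual KVM code:

    #include <assert.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct vcpu { int id; };            /* stand-in for struct kvm_vcpu */

    struct shadow_page {
            bool direct;                /* no guest page table behind it */
            bool unsync;                /* stale vs. the guest page table */
    };

    /* Syncing reads guest page tables, which requires a vCPU context. */
    static void sync_page(struct vcpu *vcpu, struct shadow_page *sp)
    {
            assert(vcpu != NULL);
            sp->unsync = false;
    }

    /*
     * @vcpu may be NULL only when looking up direct pages: direct pages
     * are never unsync, so the sync path below is unreachable for them.
     */
    static struct shadow_page *find_page(struct vcpu *vcpu,
                                         struct shadow_page *sp)
    {
            if (sp->unsync) {
                    if (vcpu == NULL)   /* caller bug, akin to KVM_BUG_ON() */
                            return NULL;
                    sync_page(vcpu, sp);
            }
            return sp;
    }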
Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-14-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 88b3f3c2c8b1..a7748c5a2385 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1975,6 +1975,12 @@ static void clear_sp_write_flooding_count(u64 *spte) __clear_sp_write_flooding_count(sptep_to_sp(spte)); } +/* + * The vCPU is required when finding indirect shadow pages; the shadow + * page may already exist and syncing it needs the vCPU pointer in + * order to read guest page tables. Direct shadow pages are never + * unsync, thus @vcpu can be NULL if @role.direct is true. + */ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, struct kvm_vcpu *vcpu, gfn_t gfn, @@ -2013,6 +2019,9 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, goto out; if (sp->unsync) { + if (KVM_BUG_ON(!vcpu, kvm)) + break; + /* * The page is good, but is stale. kvm_sync_page does * get the latest guest state, but (unlike mmu_unsync_children) @@ -2090,6 +2099,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, return sp; } +/* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */ static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm, struct kvm_vcpu *vcpu, struct shadow_page_caches *caches, -- cgit v1.2.3 From 6ec6509eea39ee249550a398fd1790b26833675d Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:27:01 -0400 Subject: KVM: x86/mmu: Pass const memslot to rmap_add() Constify rmap_add()'s @slot parameter; it is simply passed on to gfn_to_rmap(), which takes a const memslot. No functional change intended. Reviewed-by: Ben Gardon Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-15-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a7748c5a2385..45a4e85c0b2c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1556,7 +1556,7 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, #define RMAP_RECYCLE_THRESHOLD 1000 -static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, +static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, u64 *spte, gfn_t gfn) { struct kvm_mmu_page *sp; -- cgit v1.2.3 From 2ff9039a75a8c19fbbcef7fe74ea61a10e4639f3 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:27:02 -0400 Subject: KVM: x86/mmu: Decouple rmap_add() and link_shadow_page() from kvm_vcpu Allow adding new entries to the rmap and linking shadow pages without a struct kvm_vcpu pointer by moving the implementation of rmap_add() and link_shadow_page() into inner helper functions. No functional change intended. 
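The shape of that refactoring is worth seeing in isolation: the inner helper takes an explicit memory cache, and the vCPU variant becomes a thin wrapper that forwards its own per-vCPU cache. The sketch below uses made-up types purely for illustration, assuming the cache hands out pre-topped-up objects:

    #include <stddef.h>

    struct cache { void *objs[64]; int nr; };       /* stand-in memory cache */
    struct vcpu  { struct cache pte_desc_cache; };  /* per-vCPU cache owner */

    static void *cache_alloc(struct cache *c)
    {
            return c->nr > 0 ? c->objs[--c->nr] : NULL;
    }

    /* Inner helper: needs only the cache, not a vCPU. */
    static void __rmap_add(struct cache *cache /* , rmap arguments... */)
    {
            void *desc = cache_alloc(cache);
            (void)desc;                 /* link desc into the rmap chain here */
    }

    /* vCPU wrapper: existing callers keep working unchanged. */
    static void rmap_add(struct vcpu *vcpu /* , rmap arguments... */)
    {
            __rmap_add(&vcpu->pte_desc_cache);
    }

A VM-ioctl path, such as the eager page splitting added later in this series, can then call the inner helper with a VM-owned cache instead of a vCPU's.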
Reviewed-by: Ben Gardon Reviewed-by: Peter Xu Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-16-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 47 +++++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 45a4e85c0b2c..a8cdbe2958d9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -673,11 +673,6 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } -static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) -{ - return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); -} - static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) { kmem_cache_free(pte_list_desc_cache, pte_list_desc); @@ -832,7 +827,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, /* * Returns the number of pointers in the rmap chain, not counting the new one. */ -static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, +static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; @@ -843,7 +838,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, rmap_head->val = (unsigned long)spte; } else if (!(rmap_head->val & 1)) { rmap_printk("%p %llx 1->many\n", spte, *spte); - desc = mmu_alloc_pte_list_desc(vcpu); + desc = kvm_mmu_memory_cache_alloc(cache); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; desc->spte_count = 2; @@ -855,7 +850,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, while (desc->spte_count == PTE_LIST_EXT) { count += PTE_LIST_EXT; if (!desc->more) { - desc->more = mmu_alloc_pte_list_desc(vcpu); + desc->more = kvm_mmu_memory_cache_alloc(cache); desc = desc->more; desc->spte_count = 0; break; @@ -1556,8 +1551,10 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, #define RMAP_RECYCLE_THRESHOLD 1000 -static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, - u64 *spte, gfn_t gfn) +static void __rmap_add(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, + const struct kvm_memory_slot *slot, + u64 *spte, gfn_t gfn) { struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; @@ -1566,15 +1563,23 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, sp = sptep_to_sp(spte); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - rmap_count = pte_list_add(vcpu, spte, rmap_head); + rmap_count = pte_list_add(cache, spte, rmap_head); if (rmap_count > RMAP_RECYCLE_THRESHOLD) { - kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); + kvm_unmap_rmapp(kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); kvm_flush_remote_tlbs_with_address( - vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); } } +static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, + u64 *spte, gfn_t gfn) +{ + struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache; + + __rmap_add(vcpu->kvm, cache, slot, spte, gfn); +} + bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; @@ -1645,13 +1650,13 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) return hash_64(gfn, KVM_MMU_HASH_SHIFT); } -static void mmu_page_add_parent_pte(struct 
kvm_vcpu *vcpu, +static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, struct kvm_mmu_page *sp, u64 *parent_pte) { if (!parent_pte) return; - pte_list_add(vcpu, parent_pte, &sp->parent_ptes); + pte_list_add(cache, parent_pte, &sp->parent_ptes); } static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, @@ -2247,8 +2252,8 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) __shadow_walk_next(iterator, *iterator->sptep); } -static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, - struct kvm_mmu_page *sp) +static void __link_shadow_page(struct kvm_mmu_memory_cache *cache, u64 *sptep, + struct kvm_mmu_page *sp) { u64 spte; @@ -2258,12 +2263,18 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, mmu_spte_set(sptep, spte); - mmu_page_add_parent_pte(vcpu, sp, sptep); + mmu_page_add_parent_pte(cache, sp, sptep); if (sp->unsync_children || sp->unsync) mark_unsync(sptep); } +static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, + struct kvm_mmu_page *sp) +{ + __link_shadow_page(&vcpu->arch.mmu_pte_list_desc_cache, sptep, sp); +} + static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned direct_access) { -- cgit v1.2.3 From 81cb4657e9f0d931216fc22966ddf68a5151a206 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:27:03 -0400 Subject: KVM: x86/mmu: Update page stats in __rmap_add() Update the page stats in __rmap_add() rather than at the call site. This will avoid having to manually update page stats when splitting huge pages in a subsequent commit. No functional change intended. Reviewed-by: Ben Gardon Reviewed-by: Peter Xu Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-17-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a8cdbe2958d9..7cca28d89a85 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1562,6 +1562,8 @@ static void __rmap_add(struct kvm *kvm, sp = sptep_to_sp(spte); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); + kvm_update_page_stats(kvm, sp->role.level, 1); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); rmap_count = pte_list_add(cache, spte, rmap_head); @@ -2783,7 +2785,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, if (!was_rmapped) { WARN_ON_ONCE(ret == RET_PF_SPURIOUS); - kvm_update_page_stats(vcpu->kvm, level, 1); rmap_add(vcpu, slot, sptep, gfn); } -- cgit v1.2.3 From 6a97575d5cffb71aa9a95d33f0ca03c8a4bb3b2b Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:27:04 -0400 Subject: KVM: x86/mmu: Cache the access bits of shadowed translations Splitting huge pages requires allocating/finding shadow pages to replace the huge page. Shadow pages are keyed, in part, off the guest access permissions they are shadowing. For fully direct MMUs, there is no shadowing so the access bits in the shadow page role are always ACC_ALL. But during shadow paging, the guest can enforce whatever access permissions it wants. In particular, eager page splitting needs to know the permissions to use for the subpages, but KVM cannot retrieve them from the guest page tables because eager page splitting does not have a vCPU. 
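As the rest of this message explains, the cache ends up being a single u64 per SPTE that packs the shadowed GFN above PAGE_SHIFT and the access bits below it. A minimal sketch of that packing, assuming a 3-bit access mask in the spirit of KVM's ACC_* flags (the exact mask value here is an assumption):

    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define ACC_MASK   0x7ull   /* 3 access bits fit below PAGE_SHIFT */

    static uint64_t pack_translation(uint64_t gfn, uint64_t access)
    {
            return (gfn << PAGE_SHIFT) | (access & ACC_MASK);
    }

    static uint64_t unpack_gfn(uint64_t entry)
    {
            return entry >> PAGE_SHIFT;
    }

    static uint64_t unpack_access(uint64_t entry)
    {
            return entry & ACC_MASK;
    }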
Fortunately, the guest access permissions are easy to cache whenever page faults or FNAME(sync_page) update the shadow page tables; this is an extension of the existing cache of the shadowed GFNs in the gfns array of the shadow page. The access bits only take up 3 bits, which leaves 61 bits left over for gfns, which is more than enough. Now that the gfns array caches more information than just GFNs, rename it to shadowed_translation. While here, preemptively fix up the WARN_ON() that detects gfn mismatches in direct SPs. The WARN_ON() was paired with a pr_err_ratelimited(), which means that users could sometimes see the WARN without the accompanying error message. Fix this by outputting the error message as part of the WARN splat, and opportunistically make them WARN_ONCE() because if these ever fire, they are all but guaranteed to fire a lot and will bring down the kernel. Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-18-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu/mmu.c | 85 +++++++++++++++++++++++++++++------------ arch/x86/kvm/mmu/mmu_internal.h | 17 ++++++++- arch/x86/kvm/mmu/paging_tmpl.h | 9 ++++- 4 files changed, 84 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7e4c31b57a75..64efe8c90c31 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -713,7 +713,7 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; struct kvm_mmu_memory_cache mmu_shadow_page_cache; - struct kvm_mmu_memory_cache mmu_gfn_array_cache; + struct kvm_mmu_memory_cache mmu_shadowed_info_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; /* diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7cca28d89a85..13a059ad5dc7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -656,7 +656,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) if (r) return r; if (maybe_indirect) { - r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache, PT64_ROOT_MAX_LEVEL); if (r) return r; @@ -669,7 +669,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } @@ -678,34 +678,68 @@ static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) kmem_cache_free(pte_list_desc_cache, pte_list_desc); } +static bool sp_has_gptes(struct kvm_mmu_page *sp); + static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) { if (sp->role.passthrough) return sp->gfn; if (!sp->role.direct) - return sp->gfns[index]; + return sp->shadowed_translation[index] >> PAGE_SHIFT; return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS)); } -static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) +/* + * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note + * that the SPTE itself may have a more constrained access permissions that + * what the guest enforces. For example, a guest may create an executable + * huge PTE but KVM may disallow execution to mitigate iTLB multihit. 
+ */ +static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index) { - if (sp->role.passthrough) { - WARN_ON_ONCE(gfn != sp->gfn); - return; - } + if (sp_has_gptes(sp)) + return sp->shadowed_translation[index] & ACC_ALL; - if (!sp->role.direct) { - sp->gfns[index] = gfn; + /* + * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs, + * KVM is not shadowing any guest page tables, so the "guest access + * permissions" are just ACC_ALL. + * + * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM + * is shadowing a guest huge page with small pages, the guest access + * permissions being shadowed are the access permissions of the huge + * page. + * + * In both cases, sp->role.access contains the correct access bits. + */ + return sp->role.access; +} + +static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index, gfn_t gfn, u32 access) +{ + if (sp_has_gptes(sp)) { + sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access; return; } - if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) - pr_err_ratelimited("gfn mismatch under direct page %llx " - "(expected %llx, got %llx)\n", - sp->gfn, - kvm_mmu_page_get_gfn(sp, index), gfn); + WARN_ONCE(access != kvm_mmu_page_get_access(sp, index), + "access mismatch under %s page %llx (expected %u, got %u)\n", + sp->role.passthrough ? "passthrough" : "direct", + sp->gfn, kvm_mmu_page_get_access(sp, index), access); + + WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index), + "gfn mismatch under %s page %llx (expected %llx, got %llx)\n", + sp->role.passthrough ? "passthrough" : "direct", + sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn); +} + +static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index, u32 access) +{ + gfn_t gfn = kvm_mmu_page_get_gfn(sp, index); + + kvm_mmu_page_set_translation(sp, index, gfn, access); } /* @@ -1554,14 +1588,14 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, static void __rmap_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, const struct kvm_memory_slot *slot, - u64 *spte, gfn_t gfn) + u64 *spte, gfn_t gfn, u32 access) { struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; int rmap_count; sp = sptep_to_sp(spte); - kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); + kvm_mmu_page_set_translation(sp, spte - sp->spt, gfn, access); kvm_update_page_stats(kvm, sp->role.level, 1); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); @@ -1575,11 +1609,11 @@ static void __rmap_add(struct kvm *kvm, } static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, - u64 *spte, gfn_t gfn) + u64 *spte, gfn_t gfn, u32 access) { struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache; - __rmap_add(vcpu->kvm, cache, slot, spte, gfn); + __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access); } bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -1643,7 +1677,7 @@ static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) list_del(&sp->link); free_page((unsigned long)sp->spt); if (!sp->role.direct) - free_page((unsigned long)sp->gfns); + free_page((unsigned long)sp->shadowed_translation); kmem_cache_free(mmu_page_header_cache, sp); } @@ -2070,7 +2104,7 @@ out: struct shadow_page_caches { struct kvm_mmu_memory_cache *page_header_cache; struct kvm_mmu_memory_cache *shadow_page_cache; - struct kvm_mmu_memory_cache *gfn_array_cache; + struct kvm_mmu_memory_cache *shadowed_info_cache; }; static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, @@ -2084,7 +2118,7 @@ static struct 
kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache); sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache); if (!role.direct) - sp->gfns = kvm_mmu_memory_cache_alloc(caches->gfn_array_cache); + sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); @@ -2136,7 +2170,7 @@ static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, struct shadow_page_caches caches = { .page_header_cache = &vcpu->arch.mmu_page_header_cache, .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache, - .gfn_array_cache = &vcpu->arch.mmu_gfn_array_cache, + .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache, }; return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role); @@ -2785,7 +2819,10 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, if (!was_rmapped) { WARN_ON_ONCE(ret == RET_PF_SPURIOUS); - rmap_add(vcpu, slot, sptep, gfn); + rmap_add(vcpu, slot, sptep, gfn, pte_access); + } else { + /* Already rmapped but the pte_access bits may have changed. */ + kvm_mmu_page_set_access(sp, sptep - sp->spt, pte_access); } return ret; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index bb9d12ac0db3..ae2d660e2dab 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -67,8 +67,21 @@ struct kvm_mmu_page { gfn_t gfn; u64 *spt; - /* hold the gfn of each spte inside spt */ - gfn_t *gfns; + + /* + * Stores the result of the guest translation being shadowed by each + * SPTE. KVM shadows two types of guest translations: nGPA -> GPA + * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both + * cases the result of the translation is a GPA and a set of access + * constraints. + * + * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed + * access permissions are stored in the lower bits. Note, for + * convenience and uniformity across guests, the access permissions are + * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format. + */ + u64 *shadowed_translation; + /* Currently serving as active root */ union { int root_count; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 6ecdd7a41a82..24f292f3f93f 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -985,7 +985,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } /* - * Using the cached information from sp->gfns is safe because: + * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is + * safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. * @@ -1067,12 +1068,16 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) * "present" bit, as all other paging modes will create a * read-only SPTE if pte_access is zero. */ - if ((!pte_access && !shadow_present_mask) || gfn != sp->gfns[i]) { + if ((!pte_access && !shadow_present_mask) || + gfn != kvm_mmu_page_get_gfn(sp, i)) { drop_spte(vcpu->kvm, &sp->spt[i]); flush = true; continue; } + /* Update the shadowed access bits in case they changed. 
*/ + kvm_mmu_page_set_access(sp, i, pte_access); + sptep = &sp->spt[i]; spte = *sptep; host_writable = spte & shadow_host_writable_mask; -- cgit v1.2.3 From 47855da0555a46eb4f98f5e5cae98525daf83ff9 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:27:05 -0400 Subject: KVM: x86/mmu: Extend make_huge_page_split_spte() for the shadow MMU Currently make_huge_page_split_spte() assumes execute permissions can be granted to any 4K SPTE when splitting huge pages. This is true for the TDP MMU but is not necessarily true for the shadow MMU, since KVM may be shadowing a non-executable huge page. To fix this, pass in the role of the child shadow page where the huge page will be split and derive the execution permission from that. This is correct because huge pages are always split with direct shadow page and thus the shadow page role contains the correct access permissions. No functional change intended. Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-19-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/spte.c | 15 +++++++-------- arch/x86/kvm/mmu/spte.h | 4 ++-- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index db294c1beea2..fb1f17504138 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -246,11 +246,10 @@ static u64 make_spte_executable(u64 spte) * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ -u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, int huge_level, +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role, int index) { u64 child_spte; - int child_level; if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte))) return 0; @@ -259,23 +258,23 @@ u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, int huge_level, return 0; child_spte = huge_spte; - child_level = huge_level - 1; /* * The child_spte already has the base address of the huge page being * split. So we just have to OR in the offset to the page at the next * lower level for the given index. */ - child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT; + child_spte |= (index * KVM_PAGES_PER_HPAGE(role.level)) << PAGE_SHIFT; - if (child_level == PG_LEVEL_4K) { + if (role.level == PG_LEVEL_4K) { child_spte &= ~PT_PAGE_SIZE_MASK; /* - * When splitting to a 4K page, mark the page executable as the - * NX hugepage mitigation no longer applies. + * When splitting to a 4K page where execution is allowed, mark + * the page executable as the NX hugepage mitigation no longer + * applies. 
*/ - if (is_nx_huge_page_enabled(kvm)) + if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm)) child_spte = make_spte_executable(child_spte); } diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 256f90587e8d..b5c855f5514f 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -421,8 +421,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, bool host_writable, u64 *new_spte); -u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, int huge_level, - int index); +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 522e2532343b..f3a430d64975 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1478,7 +1478,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * not been linked in yet and thus is not reachable from any other CPU. */ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) - sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, level, i); + sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i); /* * Replace the huge spte with a pointer to the populated lower level -- cgit v1.2.3 From 20d49186c0302015c229e23c7e63855cbacc8032 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:27:06 -0400 Subject: KVM: x86/mmu: Zap collapsible SPTEs in shadow MMU at all possible levels Currently KVM only zaps collapsible 4KiB SPTEs in the shadow MMU. This is fine for now since KVM never creates intermediate huge pages during dirty logging. In other words, KVM always replaces 1GiB pages directly with 4KiB pages, so there is no reason to look for collapsible 2MiB pages. However, this will stop being true once the shadow MMU participates in eager page splitting. During eager page splitting, each 1GiB is first split into 2MiB pages and then those are split into 4KiB pages. The intermediate 2MiB pages may be left behind if an error condition causes eager page splitting to bail early. No functional change intended. Reviewed-by: Peter Xu Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-20-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 13a059ad5dc7..7fdbb9060e9d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6154,18 +6154,24 @@ restart: return need_tlb_flush; } +static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot) +{ + /* + * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap + * pages that are already mapped at the maximum hugepage level. 
+ */ + if (slot_handle_level(kvm, slot, kvm_mmu_zap_collapsible_spte, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true)) + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); +} + void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - /* - * Zap only 4k SPTEs since the legacy MMU only supports dirty - * logging at a 4k granularity and never creates collapsible - * 2m SPTEs during dirty logging. - */ - if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true)) - kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + kvm_rmap_zap_collapsible_sptes(kvm, slot); write_unlock(&kvm->mmu_lock); } -- cgit v1.2.3 From 0cd8dc739833080aa0813cbd94d907a93e3a14c3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 22 Jun 2022 15:27:07 -0400 Subject: KVM: x86/mmu: pull call to drop_large_spte() into __link_shadow_page() Before allocating a child shadow page table, all callers check whether the parent already points to a huge page and, if so, they drop that SPTE. This is done by drop_large_spte(). However, dropping the large SPTE is really only necessary before the sp is installed. While the sp is returned by kvm_mmu_get_child_sp(), installing it happens later in __link_shadow_page(). Move the call there instead of having it in each and every caller. To ensure that the shadow page is not linked twice if it was present, do _not_ opportunistically make kvm_mmu_get_child_sp() idempotent: instead, return an error value if the shadow page already existed. This is a bit more verbose, but clearer than NULL. Finally, now that the drop_large_spte() name is not taken anymore, remove the two underscores in front of __drop_large_spte(). Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 43 +++++++++++++++++++++--------------------- arch/x86/kvm/mmu/paging_tmpl.h | 31 ++++++++++++++---------------- 2 files changed, 35 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7fdbb9060e9d..192cb7dc4471 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1135,26 +1135,16 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } - -static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) +static void drop_large_spte(struct kvm *kvm, u64 *sptep) { - if (is_large_pte(*sptep)) { - WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); - drop_spte(kvm, sptep); - return true; - } - - return false; -} + struct kvm_mmu_page *sp; -static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) -{ - if (__drop_large_spte(vcpu->kvm, sptep)) { - struct kvm_mmu_page *sp = sptep_to_sp(sptep); + sp = sptep_to_sp(sptep); + WARN_ON(sp->role.level == PG_LEVEL_4K); - kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, + drop_spte(kvm, sptep); + kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); - } } /* @@ -2221,6 +2211,9 @@ static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, { union kvm_mmu_page_role role; + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) + return ERR_PTR(-EEXIST); + role = kvm_mmu_child_role(sptep, direct, access); return kvm_mmu_get_shadow_page(vcpu, gfn, role); } @@ -2288,13 +2281,21 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) __shadow_walk_next(iterator, *iterator->sptep); } -static void __link_shadow_page(struct kvm_mmu_memory_cache *cache, u64 *sptep, +static void 
__link_shadow_page(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, u64 *sptep, struct kvm_mmu_page *sp) { u64 spte; BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); + /* + * If an SPTE is present already, it must be a leaf and therefore + * a large one. Drop it and flush the TLB before installing sp. + */ + if (is_shadow_present_pte(*sptep)) + drop_large_spte(kvm, sptep); + spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); mmu_spte_set(sptep, spte); @@ -2308,7 +2309,7 @@ static void __link_shadow_page(struct kvm_mmu_memory_cache *cache, u64 *sptep, static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, struct kvm_mmu_page *sp) { - __link_shadow_page(&vcpu->arch.mmu_pte_list_desc_cache, sptep, sp); + __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp); } static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, @@ -3080,11 +3081,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (it.level == fault->goal_level) break; - drop_large_spte(vcpu, it.sptep); - if (is_shadow_present_pte(*it.sptep)) - continue; - sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL); + if (sp == ERR_PTR(-EEXIST)) + continue; link_shadow_page(vcpu, it.sptep, sp); if (fault->is_tdp && fault->huge_page_disallowed && diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 24f292f3f93f..2448fa8d8438 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -648,15 +648,13 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, gfn_t table_gfn; clear_sp_write_flooding_count(it.sptep); - drop_large_spte(vcpu, it.sptep); - sp = NULL; - if (!is_shadow_present_pte(*it.sptep)) { - table_gfn = gw->table_gfn[it.level - 2]; - access = gw->pt_access[it.level - 2]; - sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, - false, access); + table_gfn = gw->table_gfn[it.level - 2]; + access = gw->pt_access[it.level - 2]; + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, + false, access); + if (sp != ERR_PTR(-EEXIST)) { /* * We must synchronize the pagetable before linking it * because the guest doesn't need to flush tlb when @@ -685,7 +683,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) goto out_gpte_changed; - if (sp) + if (sp != ERR_PTR(-EEXIST)) link_shadow_page(vcpu, it.sptep, sp); } @@ -709,16 +707,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, validate_direct_spte(vcpu, it.sptep, direct_access); - drop_large_spte(vcpu, it.sptep); + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, + true, direct_access); + if (sp == ERR_PTR(-EEXIST)) + continue; - if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, - true, direct_access); - link_shadow_page(vcpu, it.sptep, sp); - if (fault->huge_page_disallowed && - fault->req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); - } + link_shadow_page(vcpu, it.sptep, sp); + if (fault->huge_page_disallowed && + fault->req_level >= it.level) + account_huge_nx_page(vcpu->kvm, sp); } if (WARN_ON_ONCE(it.level != fault->goal_level)) -- cgit v1.2.3 From ada51a9de7375ef8933fcaa1af9cb61a7fe0ceef Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Jun 2022 15:27:09 -0400 Subject: KVM: x86/mmu: Extend Eager Page Splitting to nested MMUs Add support for Eager Page Splitting pages that are mapped by nested MMUs. 
Walk through the rmap first splitting all 1GiB pages to 2MiB pages, and then splitting all 2MiB pages to 4KiB pages. Note, Eager Page Splitting is limited to nested MMUs as a policy rather than due to any technical reason (the sp->role.guest_mode check could just be deleted and Eager Page Splitting would work correctly for all shadow MMU pages). There is really no reason to support Eager Page Splitting for tdp_mmu=N, since such support will eventually be phased out, and there is no current use case supporting Eager Page Splitting on hosts where TDP is either disabled or unavailable in hardware. Furthermore, future improvements to nested MMU scalability may diverge the code from the legacy shadow paging implementation. These improvements will be simpler to make if Eager Page Splitting does not have to worry about legacy shadow paging. Splitting huge pages mapped by nested MMUs requires dealing with some extra complexity beyond that of the TDP MMU: (1) The shadow MMU has a limit on the number of shadow pages that are allowed to be allocated. So, as a policy, Eager Page Splitting refuses to split if there are KVM_MIN_FREE_MMU_PAGES or fewer pages available. (2) Splitting a huge page may end up re-using an existing lower level shadow page table. This is unlike the TDP MMU which always allocates new shadow page tables when splitting. (3) When installing the lower level SPTEs, they must be added to the rmap which may require allocating additional pte_list_desc structs. Case (2) is especially interesting since it may require a TLB flush, unlike the TDP MMU which can fully split huge pages without any TLB flushes. Specifically, an existing lower level page table may point to even lower level page tables that are not fully populated, effectively unmapping a portion of the huge page, which requires a flush. As of this commit, a flush is always done after dropping the huge page and before installing the lower level page table. This TLB flush could instead be delayed until the MMU lock is about to be dropped, which would batch flushes for multiple splits. However, these flushes should be rare in practice (a huge page must be aliased in multiple SPTEs and have been split for NX Huge Pages in only some of them). Flushing immediately is simpler to plumb and also reduces the chances of tripping over a CPU bug (e.g. see iTLB multihit). [ This commit is based off of the original implementation of Eager Page Splitting from Peter in Google's kernel from 2016. ] Suggested-by: Peter Feiner Signed-off-by: David Matlack Message-Id: <20220516232138.1783324-23-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- Documentation/admin-guide/kernel-parameters.txt | 3 +- arch/x86/include/asm/kvm_host.h | 22 ++ arch/x86/kvm/mmu/mmu.c | 259 +++++++++++++++++++++++- 3 files changed, 275 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 97c16aa2f53f..329f0f274e2b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2418,8 +2418,7 @@ the KVM_CLEAR_DIRTY ioctl, and only for the pages being cleared. - Eager page splitting currently only supports splitting - huge pages mapped by the TDP MMU. + Eager page splitting is only supported when kvm.tdp_mmu=Y. Default is Y (on). 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 64efe8c90c31..665667d61caf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1338,6 +1338,28 @@ struct kvm_arch { u32 max_vcpu_ids; bool disable_nx_huge_pages; + + /* + * Memory caches used to allocate shadow pages when performing eager + * page splitting. No need for a shadowed_info_cache since eager page + * splitting only allocates direct shadow pages. + * + * Protected by kvm->slots_lock. + */ + struct kvm_mmu_memory_cache split_shadow_page_cache; + struct kvm_mmu_memory_cache split_page_header_cache; + + /* + * Memory cache used to allocate pte_list_desc structs while splitting + * huge pages. In the worst case, to split one huge page, 512 + * pte_list_desc structs are needed to add each lower level leaf sptep + * to the rmap plus 1 to extend the parent_ptes rmap of the lower level + * page table. + * + * Protected by kvm->slots_lock. + */ +#define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1) + struct kvm_mmu_memory_cache split_desc_cache; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 192cb7dc4471..9bfe339bf67f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5942,9 +5942,25 @@ int kvm_mmu_init_vm(struct kvm *kvm) node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); + + kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache; + kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO; + + kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO; + + kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache; + kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO; + return 0; } +static void mmu_free_vm_memory_caches(struct kvm *kvm) +{ + kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache); + kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache); + kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache); +} + void kvm_mmu_uninit_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; @@ -5952,6 +5968,8 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_page_track_unregister_notifier(kvm, node); kvm_mmu_uninit_tdp_mmu(kvm); + + mmu_free_vm_memory_caches(kvm); } static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) @@ -6073,15 +6091,235 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } +static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min) +{ + return kvm_mmu_memory_cache_nr_free_objects(cache) < min; +} + +static bool need_topup_split_caches_or_resched(struct kvm *kvm) +{ + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) + return true; + + /* + * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed + * to split a single huge page. Calculating how many are actually needed + * is possible but not worth the complexity. + */ + return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) || + need_topup(&kvm->arch.split_page_header_cache, 1) || + need_topup(&kvm->arch.split_shadow_page_cache, 1); +} + +static int topup_split_caches(struct kvm *kvm) +{ + int r; + + lockdep_assert_held(&kvm->slots_lock); + + /* + * Setting capacity == min would cause KVM to drop mmu_lock even if + * just one object was consumed from the cache, so make capacity + * larger than min. 
+ */ + r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, + 2 * SPLIT_DESC_CACHE_MIN_NR_OBJECTS, + SPLIT_DESC_CACHE_MIN_NR_OBJECTS); + if (r) + return r; + + r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1); + if (r) + return r; + + return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1); +} + +static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep) +{ + struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); + struct shadow_page_caches caches = {}; + union kvm_mmu_page_role role; + unsigned int access; + gfn_t gfn; + + gfn = kvm_mmu_page_get_gfn(huge_sp, huge_sptep - huge_sp->spt); + access = kvm_mmu_page_get_access(huge_sp, huge_sptep - huge_sp->spt); + + /* + * Note, huge page splitting always uses direct shadow pages, regardless + * of whether the huge page itself is mapped by a direct or indirect + * shadow page, since the huge page region itself is being directly + * mapped with smaller pages. + */ + role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access); + + /* Direct SPs do not require a shadowed_info_cache. */ + caches.page_header_cache = &kvm->arch.split_page_header_cache; + caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache; + + /* Safe to pass NULL for vCPU since requesting a direct SP. */ + return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role); +} + +static void shadow_mmu_split_huge_page(struct kvm *kvm, + const struct kvm_memory_slot *slot, + u64 *huge_sptep) + +{ + struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache; + u64 huge_spte = READ_ONCE(*huge_sptep); + struct kvm_mmu_page *sp; + u64 *sptep, spte; + gfn_t gfn; + int index; + + sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep); + + for (index = 0; index < SPTE_ENT_PER_PAGE; index++) { + sptep = &sp->spt[index]; + gfn = kvm_mmu_page_get_gfn(sp, index); + + /* + * The SP may already have populated SPTEs, e.g. if this huge + * page is aliased by multiple sptes with the same access + * permissions. These entries are guaranteed to map the same + * gfn-to-pfn translation since the SP is direct, so no need to + * modify them. + * + * If a given SPTE points to a lower level page table, installing + * such SPTEs would effectively unmap a potion of the huge page. + * This is not an issue because __link_shadow_page() flushes the TLB + * when the passed sp replaces a large SPTE. + */ + if (is_shadow_present_pte(*sptep)) + continue; + + spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index); + mmu_spte_set(sptep, spte); + __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access); + } + + __link_shadow_page(kvm, cache, huge_sptep, sp); +} + +static int shadow_mmu_try_split_huge_page(struct kvm *kvm, + const struct kvm_memory_slot *slot, + u64 *huge_sptep) +{ + struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); + int level, r = 0; + gfn_t gfn; + u64 spte; + + /* Grab information for the tracepoint before dropping the MMU lock. */ + gfn = kvm_mmu_page_get_gfn(huge_sp, huge_sptep - huge_sp->spt); + level = huge_sp->role.level; + spte = *huge_sptep; + + if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) { + r = -ENOSPC; + goto out; + } + + if (need_topup_split_caches_or_resched(kvm)) { + write_unlock(&kvm->mmu_lock); + cond_resched(); + /* + * If the topup succeeds, return -EAGAIN to indicate that the + * rmap iterator should be restarted because the MMU lock was + * dropped. 
+ */ + r = topup_split_caches(kvm) ?: -EAGAIN; + write_lock(&kvm->mmu_lock); + goto out; + } + + shadow_mmu_split_huge_page(kvm, slot, huge_sptep); + +out: + trace_kvm_mmu_split_huge_page(gfn, spte, level, r); + return r; +} + +static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot) +{ + struct rmap_iterator iter; + struct kvm_mmu_page *sp; + u64 *huge_sptep; + int r; + +restart: + for_each_rmap_spte(rmap_head, &iter, huge_sptep) { + sp = sptep_to_sp(huge_sptep); + + /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */ + if (WARN_ON_ONCE(!sp->role.guest_mode)) + continue; + + /* The rmaps should never contain non-leaf SPTEs. */ + if (WARN_ON_ONCE(!is_large_pte(*huge_sptep))) + continue; + + /* SPs with level >PG_LEVEL_4K should never by unsync. */ + if (WARN_ON_ONCE(sp->unsync)) + continue; + + /* Don't bother splitting huge pages on invalid SPs. */ + if (sp->role.invalid) + continue; + + r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep); + + /* + * The split succeeded or needs to be retried because the MMU + * lock was dropped. Either way, restart the iterator to get it + * back into a consistent state. + */ + if (!r || r == -EAGAIN) + goto restart; + + /* The split failed and shouldn't be retried (e.g. -ENOMEM). */ + break; + } + + return false; +} + +static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t start, gfn_t end, + int target_level) +{ + int level; + + /* + * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working + * down to the target level. This ensures pages are recursively split + * all the way to the target level. There's no need to split pages + * already at the target level. + */ + for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) { + slot_handle_level_range(kvm, slot, shadow_mmu_try_split_huge_pages, + level, level, start, end - 1, true, false); + } +} + /* Must be called with the mmu_lock held in write-mode. */ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, u64 start, u64 end, int target_level) { - if (is_tdp_mmu_enabled(kvm)) - kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, - target_level, false); + if (!is_tdp_mmu_enabled(kvm)) + return; + + if (kvm_memslots_have_rmaps(kvm)) + kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); + + kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false); /* * A TLB flush is unnecessary at this point for the same resons as in @@ -6096,12 +6334,19 @@ void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, u64 start = memslot->base_gfn; u64 end = start + memslot->npages; - if (is_tdp_mmu_enabled(kvm)) { - read_lock(&kvm->mmu_lock); - kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); - read_unlock(&kvm->mmu_lock); + if (!is_tdp_mmu_enabled(kvm)) + return; + + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); + write_unlock(&kvm->mmu_lock); } + read_lock(&kvm->mmu_lock); + kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); + read_unlock(&kvm->mmu_lock); + /* * No TLB flush is necessary here. 
KVM will flush TLBs after * write-protecting and/or clearing dirty on the newly split SPTEs to -- cgit v1.2.3 From 951ceb94ede39acbebf34481ec87fe2c46b91e0f Mon Sep 17 00:00:00 2001 From: Jue Wang Date: Fri, 10 Jun 2022 10:11:27 -0700 Subject: KVM: x86: Make APIC_VERSION capture only the magic 0x14UL. Refactor APIC_VERSION so that the maximum number of LVT entries is inserted at runtime rather than compile time. This will be used in a subsequent commit to expose the LVT CMCI Register to VMs that support Corrected Machine Check error counting/signaling (IA32_MCG_CAP.MCG_CMCI_P=1). Suggested-by: Sean Christopherson Signed-off-by: Jue Wang Signed-off-by: Paolo Bonzini Message-Id: <20220610171134.772566-2-juew@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 43c42a580295..828af02be350 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -54,7 +54,7 @@ #define PRIo64 "o" /* 14 is the version for Xeon and Pentium 8.4.8*/ -#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) +#define APIC_VERSION 0x14UL #define LAPIC_MMIO_LENGTH (1 << 12) /* followed define is not in apicdef.h */ #define MAX_APIC_VECTOR 256 @@ -402,7 +402,7 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) void kvm_apic_set_version(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 v = APIC_VERSION; + u32 v = APIC_VERSION | ((KVM_APIC_LVT_NUM - 1) << 16); if (!lapic_in_kernel(vcpu)) return; -- cgit v1.2.3 From 1d8c681fb6ed459128ab9d5e36adb7ec06a26aea Mon Sep 17 00:00:00 2001 From: Jue Wang Date: Fri, 10 Jun 2022 10:11:28 -0700 Subject: KVM: x86: Fill apic_lvt_mask with enums / explicit entries. This patch defines a lapic_lvt_entry enum used as explicit indices to the apic_lvt_mask array. In later patches a LVT_CMCI will be added to implement the Corrected Machine Check Interrupt signaling. 
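The benefit of named indices shows up when entries are added or reordered: designated initializers bind each mask to its entry by name, and lookups become self-describing. A compilable sketch follows; the mask values are placeholders, not KVM's real LVT_MASK/LINT_MASK definitions:

    enum lapic_lvt_entry {
            LVT_TIMER,
            LVT_THERMAL_MONITOR,
            LVT_PERFORMANCE_COUNTER,
            LVT_LINT0,
            LVT_LINT1,
            LVT_ERROR,
            KVM_APIC_MAX_NR_LVT_ENTRIES,
    };

    /* Placeholder masks; the real values live in apicdef.h/lapic.c. */
    #define LVT_MASK_SKETCH  0x000100ffu
    #define MODE_MASK_SKETCH 0x00000700u
    #define LINT_MASK_SKETCH (LVT_MASK_SKETCH | MODE_MASK_SKETCH | 0x0000e000u)

    static const unsigned int lvt_mask_sketch[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
            [LVT_TIMER]               = LVT_MASK_SKETCH,
            [LVT_THERMAL_MONITOR]     = LVT_MASK_SKETCH | MODE_MASK_SKETCH,
            [LVT_PERFORMANCE_COUNTER] = LVT_MASK_SKETCH | MODE_MASK_SKETCH,
            [LVT_LINT0]               = LINT_MASK_SKETCH,
            [LVT_LINT1]               = LINT_MASK_SKETCH,
            [LVT_ERROR]               = LVT_MASK_SKETCH,
    };

    /* A guest write keeps only the bits the mask allows for that entry. */
    static unsigned int sanitize_lvt_write(enum lapic_lvt_entry i,
                                           unsigned int val)
    {
            return val & lvt_mask_sketch[i];
    }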
Suggested-by: Sean Christopherson Signed-off-by: Jue Wang Signed-off-by: Paolo Bonzini Message-Id: <20220610171134.772566-3-juew@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 19 ++++++++++--------- arch/x86/kvm/lapic.h | 12 +++++++++++- 2 files changed, 21 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 828af02be350..fe295c718fd9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -402,7 +402,7 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) void kvm_apic_set_version(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 v = APIC_VERSION | ((KVM_APIC_LVT_NUM - 1) << 16); + u32 v = APIC_VERSION | ((KVM_APIC_MAX_NR_LVT_ENTRIES - 1) << 16); if (!lapic_in_kernel(vcpu)) return; @@ -420,12 +420,13 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) kvm_lapic_set_reg(apic, APIC_LVR, v); } -static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = { - LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ - LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ - LVT_MASK | APIC_MODE_MASK, /* LVTPC */ - LINT_MASK, LINT_MASK, /* LVT0-1 */ - LVT_MASK /* LVTERR */ +static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = { + [LVT_TIMER] = LVT_MASK, /* timer mode mask added at runtime */ + [LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK, + [LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK, + [LVT_LINT0] = LINT_MASK, + [LVT_LINT1] = LINT_MASK, + [LVT_ERROR] = LVT_MASK }; static int find_highest_vector(void *bitmap) @@ -2091,7 +2092,7 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) int i; u32 lvt_val; - for (i = 0; i < KVM_APIC_LVT_NUM; i++) { + for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++) { lvt_val = kvm_lapic_get_reg(apic, APIC_LVTT + 0x10 * i); kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, @@ -2409,7 +2410,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); - for (i = 0; i < KVM_APIC_LVT_NUM; i++) + for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++) kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); apic_update_lvtt(apic); if (kvm_vcpu_is_reset_bsp(vcpu) && diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 6207b64b1281..af26777fb0ea 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -10,7 +10,6 @@ #define KVM_APIC_INIT 0 #define KVM_APIC_SIPI 1 -#define KVM_APIC_LVT_NUM 6 #define APIC_SHORT_MASK 0xc0000 #define APIC_DEST_NOSHORT 0x0 @@ -29,6 +28,17 @@ enum lapic_mode { LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE, }; +enum lapic_lvt_entry { + LVT_TIMER, + LVT_THERMAL_MONITOR, + LVT_PERFORMANCE_COUNTER, + LVT_LINT0, + LVT_LINT1, + LVT_ERROR, + + KVM_APIC_MAX_NR_LVT_ENTRIES, +}; + struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ -- cgit v1.2.3 From 0378739401cfc2fd2a50a2a337a4f394e8493008 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 22 Jun 2022 15:27:10 -0400 Subject: KVM: x86/mmu: Avoid unnecessary flush on eager page split The TLB flush before installing the newly-populated lower level page table is unnecessary if the lower-level page table maps the huge page identically. KVM knows it is if it did not reuse an existing shadow page table, tell drop_large_spte() to skip the flush in that case. Extracted from a patch by David Matlack. 
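The optimization reduces to a "did this actually unmap anything?" accumulator: only a present non-leaf entry hides a partially populated subtree, so only that case forces a flush. A self-contained sketch of the control flow, with the SPTE predicates faked as bit tests purely for illustration:

    #include <stdbool.h>
    #include <stdint.h>

    static bool spte_present(uint64_t spte) { return spte & 0x1; }
    static bool spte_is_leaf(uint64_t spte) { return spte & 0x2; }

    static void flush_remote_tlbs(void)
    {
            /* the expensive remote TLB flush would go here */
    }

    static void split_huge_page(uint64_t *table, int nr_entries)
    {
            bool flush = false;
            int i;

            for (i = 0; i < nr_entries; i++) {
                    if (spte_present(table[i])) {
                            /* reused non-leaf => partial subtree => unmaps */
                            flush |= !spte_is_leaf(table[i]);
                            continue;
                    }
                    table[i] = 0x3;     /* install a small leaf mapping */
            }

            if (flush)
                    flush_remote_tlbs();
    }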
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 9bfe339bf67f..bd74a287b54a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1135,7 +1135,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } -static void drop_large_spte(struct kvm *kvm, u64 *sptep) +static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) { struct kvm_mmu_page *sp; @@ -1143,7 +1143,9 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep) WARN_ON(sp->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); - kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, + + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); } @@ -2283,7 +2285,7 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) static void __link_shadow_page(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, u64 *sptep, - struct kvm_mmu_page *sp) + struct kvm_mmu_page *sp, bool flush) { u64 spte; @@ -2291,10 +2293,11 @@ static void __link_shadow_page(struct kvm *kvm, /* * If an SPTE is present already, it must be a leaf and therefore - * a large one. Drop it and flush the TLB before installing sp. + * a large one. Drop it, and flush the TLB if needed, before + * installing sp. */ if (is_shadow_present_pte(*sptep)) - drop_large_spte(kvm, sptep); + drop_large_spte(kvm, sptep, flush); spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); @@ -2309,7 +2312,7 @@ static void __link_shadow_page(struct kvm *kvm, static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, struct kvm_mmu_page *sp) { - __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp); + __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true); } static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, @@ -6170,6 +6173,7 @@ static void shadow_mmu_split_huge_page(struct kvm *kvm, struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache; u64 huge_spte = READ_ONCE(*huge_sptep); struct kvm_mmu_page *sp; + bool flush = false; u64 *sptep, spte; gfn_t gfn; int index; @@ -6187,20 +6191,24 @@ static void shadow_mmu_split_huge_page(struct kvm *kvm, * gfn-to-pfn translation since the SP is direct, so no need to * modify them. * - * If a given SPTE points to a lower level page table, installing - * such SPTEs would effectively unmap a potion of the huge page. - * This is not an issue because __link_shadow_page() flushes the TLB - * when the passed sp replaces a large SPTE. + * However, if a given SPTE points to a lower level page table, + * that lower level page table may only be partially populated. + * Installing such SPTEs would effectively unmap a potion of the + * huge page. Unmapping guest memory always requires a TLB flush + * since a subsequent operation on the unmapped regions would + * fail to detect the need to flush. 
*/ - if (is_shadow_present_pte(*sptep)) + if (is_shadow_present_pte(*sptep)) { + flush |= !is_last_spte(*sptep, sp->role.level); continue; + } spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index); mmu_spte_set(sptep, spte); __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access); } - __link_shadow_page(kvm, cache, huge_sptep, sp); + __link_shadow_page(kvm, cache, huge_sptep, sp, flush); } static int shadow_mmu_try_split_huge_page(struct kvm *kvm, -- cgit v1.2.3 From 987f625e0799c9666ce0a0e18c2d0c001db96fad Mon Sep 17 00:00:00 2001 From: Jue Wang Date: Fri, 10 Jun 2022 10:11:29 -0700 Subject: KVM: x86: Add APIC_LVTx() macro. An APIC_LVTx macro is introduced to calculate the APIC_LVTx register offset based on the index in the lapic_lvt_entry enum. Later patches will extend the APIC_LVTx macro to support the APIC_LVTCMCI register in order to implement Corrected Machine Check Interrupt signaling. Suggested-by: Sean Christopherson Signed-off-by: Jue Wang Signed-off-by: Paolo Bonzini Message-Id: <20220610171134.772566-4-juew@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 7 +++---- arch/x86/kvm/lapic.h | 2 ++ 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index fe295c718fd9..30eb9b7fbe71 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2093,9 +2093,8 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) u32 lvt_val; for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++) { - lvt_val = kvm_lapic_get_reg(apic, - APIC_LVTT + 0x10 * i); - kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, + lvt_val = kvm_lapic_get_reg(apic, APIC_LVTx(i)); + kvm_lapic_set_reg(apic, APIC_LVTx(i), lvt_val | APIC_LVT_MASKED); } apic_update_lvtt(apic); @@ -2411,7 +2410,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_apic_set_version(apic->vcpu); for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++) - kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); + kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED); apic_update_lvtt(apic); if (kvm_vcpu_is_reset_bsp(vcpu) && kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index af26777fb0ea..08843c21084d 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -39,6 +39,8 @@ enum lapic_lvt_entry { KVM_APIC_MAX_NR_LVT_ENTRIES, }; +#define APIC_LVTx(x) (APIC_LVTT + 0x10 * (x)) + struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ -- cgit v1.2.3 From 4b903561ec499eef233bf690076cd71b1f8604cf Mon Sep 17 00:00:00 2001 From: Jue Wang Date: Fri, 10 Jun 2022 10:11:30 -0700 Subject: KVM: x86: Add Corrected Machine Check Interrupt (CMCI) emulation to lapic. This patch calculates the number of lvt entries as part of KVM_X86_MCE_SETUP conditioned on the presence of the MCG_CMCI_P bit in MCG_CAP and stores the result in kvm_lapic. It translates from the APIC_LVTx register to an index in the lapic_lvt_entry enum. It extends the APIC_LVTx macro as well as other lapic write/reset handling etc to support Corrected Machine Check Interrupt.
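For illustration, the per-vCPU LVT count reduces to a one-line expression; a minimal stand-alone sketch (user-space C, constants mirroring the patch):

  #include <stdio.h>
  #include <stdint.h>

  #define MCG_CMCI_P                  (1ULL << 10) /* CMCI capability bit in MCG_CAP */
  #define KVM_APIC_MAX_NR_LVT_ENTRIES 7            /* LVT_TIMER..LVT_ERROR plus LVT_CMCI */

  static int nr_lvt_entries(uint64_t mcg_cap)
  {
          /* the extra LVTCMCI entry exists only when CMCI is advertised */
          return KVM_APIC_MAX_NR_LVT_ENTRIES - !(mcg_cap & MCG_CMCI_P);
  }

  int main(void)
  {
          printf("without CMCI: %d\n", nr_lvt_entries(0));          /* 6 */
          printf("with CMCI:    %d\n", nr_lvt_entries(MCG_CMCI_P)); /* 7 */
          return 0;
  }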
Signed-off-by: Jue Wang Signed-off-by: Paolo Bonzini Message-Id: <20220610171134.772566-5-juew@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 49 ++++++++++++++++++++++++++++++++++--------------- arch/x86/kvm/lapic.h | 4 +++- arch/x86/kvm/x86.c | 2 ++ 3 files changed, 39 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 30eb9b7fbe71..f03facc2ee3e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -399,14 +400,21 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; } +static inline bool kvm_lapic_lvt_supported(struct kvm_lapic *apic, int lvt_index) +{ + return apic->nr_lvt_entries > lvt_index; +} + void kvm_apic_set_version(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 v = APIC_VERSION | ((KVM_APIC_MAX_NR_LVT_ENTRIES - 1) << 16); + u32 v = 0; if (!lapic_in_kernel(vcpu)) return; + v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16); + /* * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation) * which doesn't have EOI register; Some buggy OSes (e.g. Windows with @@ -426,7 +434,8 @@ static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = { [LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK, [LVT_LINT0] = LINT_MASK, [LVT_LINT1] = LINT_MASK, - [LVT_ERROR] = LVT_MASK + [LVT_ERROR] = LVT_MASK, + [LVT_CMCI] = LVT_MASK | APIC_MODE_MASK }; static int find_highest_vector(void *bitmap) @@ -1436,6 +1445,9 @@ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, APIC_REG_MASK(APIC_TMCCT) | APIC_REG_MASK(APIC_TDCR); + if (kvm_lapic_lvt_supported(apic, LVT_CMCI)) + valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI); + /* * ARBPRI and ICR2 are not valid in x2APIC mode. 
WARN if KVM reads ICR * in x2APIC mode as it's an 8-byte register in x2APIC and needs to be @@ -2044,6 +2056,16 @@ static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic) kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); } +static int get_lvt_index(u32 reg) +{ + if (reg == APIC_LVTCMCI) + return LVT_CMCI; + if (reg < APIC_LVTT || reg > APIC_LVTERR) + return -1; + return array_index_nospec( + (reg - APIC_LVTT) >> 4, KVM_APIC_MAX_NR_LVT_ENTRIES); +} + static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { int ret = 0; @@ -2090,12 +2112,10 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) apic_set_spiv(apic, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { int i; - u32 lvt_val; - for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++) { - lvt_val = kvm_lapic_get_reg(apic, APIC_LVTx(i)); + for (i = 0; i < apic->nr_lvt_entries; i++) { kvm_lapic_set_reg(apic, APIC_LVTx(i), - lvt_val | APIC_LVT_MASKED); + kvm_lapic_get_reg(apic, APIC_LVTx(i)) | APIC_LVT_MASKED); } apic_update_lvtt(apic); atomic_set(&apic->lapic_timer.pending, 0); @@ -2124,16 +2144,15 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: - case APIC_LVTERR: { - /* TODO: Check vector */ - size_t size; - u32 index; - + case APIC_LVTERR: + case APIC_LVTCMCI: { + u32 index = get_lvt_index(reg); + if (!kvm_lapic_lvt_supported(apic, index)) { + ret = 1; + break; + } if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; - size = ARRAY_SIZE(apic_lvt_mask); - index = array_index_nospec( - (reg - APIC_LVTT) >> 4, size); val &= apic_lvt_mask[index]; kvm_lapic_set_reg(apic, reg, val); break; @@ -2409,7 +2428,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); - for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++) + for (i = 0; i < apic->nr_lvt_entries; i++) kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED); apic_update_lvtt(apic); if (kvm_vcpu_is_reset_bsp(vcpu) && diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 08843c21084d..762bf6163798 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -35,11 +35,12 @@ enum lapic_lvt_entry { LVT_LINT0, LVT_LINT1, LVT_ERROR, + LVT_CMCI, KVM_APIC_MAX_NR_LVT_ENTRIES, }; -#define APIC_LVTx(x) (APIC_LVTT + 0x10 * (x)) +#define APIC_LVTx(x) ((x) == LVT_CMCI ? APIC_LVTCMCI : APIC_LVTT + 0x10 * (x)) struct kvm_timer { struct hrtimer timer; @@ -78,6 +79,7 @@ struct kvm_lapic { struct gfn_to_hva_cache vapic_cache; unsigned long pending_events; unsigned int sipi_vector; + int nr_lvt_entries; }; struct dest_map; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7ce0c6fe166d..6d6d6c6413ba 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4845,6 +4845,8 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, /* Init IA32_MCi_CTL to all 1s */ for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; + vcpu->arch.apic->nr_lvt_entries = + KVM_APIC_MAX_NR_LVT_ENTRIES - !(mcg_cap & MCG_CMCI_P); static_call(kvm_x86_setup_mce)(vcpu); out: -- cgit v1.2.3 From 087acc4e1847993a66f43128e40be0ab2244c98f Mon Sep 17 00:00:00 2001 From: Jue Wang Date: Fri, 10 Jun 2022 10:11:31 -0700 Subject: KVM: x86: Use kcalloc to allocate the mce_banks array. 
This patch updates the allocation of mce_banks with the array allocation API (kcalloc) as a precedent for the later mci_ctl2_banks to implement per-bank control of Corrected Machine Check Interrupt (CMCI). Suggested-by: Sean Christopherson Signed-off-by: Jue Wang Signed-off-by: Paolo Bonzini Message-Id: <20220610171134.772566-6-juew@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6d6d6c6413ba..3c6eb6837b96 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11447,7 +11447,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) goto fail_free_lapic; vcpu->arch.pio_data = page_address(page); - vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, + vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64), GFP_KERNEL_ACCOUNT); if (!vcpu->arch.mce_banks) goto fail_free_pio_data; -- cgit v1.2.3 From 281b52780b57821abc434c0413107d3075881a1f Mon Sep 17 00:00:00 2001 From: Jue Wang Date: Fri, 10 Jun 2022 10:11:32 -0700 Subject: KVM: x86: Add emulation for MSR_IA32_MCx_CTL2 MSRs. This patch adds the emulation of IA32_MCi_CTL2 registers to KVM. A separate mci_ctl2_banks array is used to keep the existing mce_banks register layout intact. In Machine Check Architecture, in addition to MCG_CMCI_P, bit 30 of the per-bank register IA32_MCi_CTL2 controls whether Corrected Machine Check error reporting is enabled. Signed-off-by: Jue Wang Signed-off-by: Paolo Bonzini Message-Id: <20220610171134.772566-7-juew@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 130 ++++++++++++++++++++++++++++------------ 2 files changed, 92 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 665667d61caf..88a3026ee163 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -826,6 +826,7 @@ struct kvm_vcpu_arch { u64 mcg_ctl; u64 mcg_ext_ctl; u64 *mce_banks; + u64 *mci_ctl2_banks; /* Cache MMIO info */ u64 mmio_gva; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3c6eb6837b96..f743e4ef2009 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3191,6 +3191,16 @@ static void kvmclock_sync_fn(struct work_struct *work) KVMCLOCK_SYNC_PERIOD); } +/* These helpers are safe iff @msr is known to be an MCx bank MSR. */ +static bool is_mci_control_msr(u32 msr) +{ + return (msr & 3) == 0; +} +static bool is_mci_status_msr(u32 msr) +{ + return (msr & 3) == 1; +} + /* * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. */ @@ -3209,6 +3219,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) unsigned bank_num = mcg_cap & 0xff; u32 msr = msr_info->index; u64 data = msr_info->data; + u32 offset, last_msr; switch (msr) { case MSR_IA32_MCG_STATUS: @@ -3222,35 +3233,53 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.mcg_ctl = data; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); - - /* only 0 or all 1s can be written to IA32_MCi_CTL - * some Linux kernels though clear bit 10 in bank 4 to - * workaround a BIOS/GART TBL issue on AMD K8s, ignore - * this to avoid an uncatched #GP in the guest. + case MSR_IA32_MC0_CTL2 ... 
MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; + + if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated)) + return 1; + /* An attempt to write a 1 to a reserved bit raises #GP */ + if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK)) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + vcpu->arch.mci_ctl2_banks[offset] = data; + break; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + /* + * Only 0 or all 1s can be written to IA32_MCi_CTL, all other + * values are architecturally undefined. But, some Linux + * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB + * issue on AMD K8s, allow bit 10 to be clear when setting all + * other bits in order to avoid an uncaught #GP in the guest. * * UNIXWARE clears bit 0 of MC1_CTL to ignore * correctable, single-bit ECC data errors. - */ - if ((offset & 0x3) == 0 && - data != 0 && (data | (1 << 10) | 1) != ~(u64)0) - return -1; - - /* MCi_STATUS */ - if (!msr_info->host_initiated && - (offset & 0x3) == 1 && data != 0) { - if (!can_set_mci_status(vcpu)) - return -1; - } + */ + if (is_mci_control_msr(msr) && + data != 0 && (data | (1 << 10) | 1) != ~(u64)0) + return 1; - vcpu->arch.mce_banks[offset] = data; - break; - } + /* + * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR. + * AMD-based CPUs allow non-zero values, but if and only if + * HWCR[McStatusWrEn] is set. + */ + if (!msr_info->host_initiated && is_mci_status_msr(msr) && + data != 0 && !can_set_mci_status(vcpu)) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + vcpu->arch.mce_banks[offset] = data; + break; + default: return 1; } return 0; @@ -3534,7 +3563,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } break; - case 0x200 ... 0x2ff: + case 0x200 ... MSR_IA32_MC0_CTL2 - 1: + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); @@ -3704,6 +3734,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: return set_msr_mce(vcpu, msr_info); case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: @@ -3819,6 +3850,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) u64 data; u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; + u32 offset, last_msr; switch (msr) { case MSR_IA32_P5_MC_ADDR: @@ -3836,16 +3868,27 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) case MSR_IA32_MCG_STATUS: data = vcpu->arch.mcg_status; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + case MSR_IA32_MC0_CTL2 ... 
MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; - data = vcpu->arch.mce_banks[offset]; - break; - } + if (!(mcg_cap & MCG_CMCI_P) && !host) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + data = vcpu->arch.mci_ctl2_banks[offset]; + break; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + data = vcpu->arch.mce_banks[offset]; + break; + default: return 1; } *pdata = data; @@ -3949,7 +3992,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; } case MSR_MTRRcap: - case 0x200 ... 0x2ff: + case 0x200 ... MSR_IA32_MC0_CTL2 - 1: + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ msr_info->data = 3; @@ -4065,6 +4109,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: return get_msr_mce(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); case MSR_IA32_XSS: @@ -4842,9 +4887,12 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, /* Init IA32_MCG_CTL to all 1s */ if (mcg_cap & MCG_CTL_P) vcpu->arch.mcg_ctl = ~(u64)0; - /* Init IA32_MCi_CTL to all 1s */ - for (bank = 0; bank < bank_num; bank++) + /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */ + for (bank = 0; bank < bank_num; bank++) { vcpu->arch.mce_banks[bank*4] = ~(u64)0; + if (mcg_cap & MCG_CMCI_P) + vcpu->arch.mci_ctl2_banks[bank] = 0; + } vcpu->arch.apic->nr_lvt_entries = KVM_APIC_MAX_NR_LVT_ENTRIES - !(mcg_cap & MCG_CMCI_P); @@ -11449,7 +11497,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64), GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.mce_banks) + vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64), + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks) goto fail_free_pio_data; vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; @@ -11503,6 +11553,7 @@ free_wbinvd_dirty_mask: free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: kfree(vcpu->arch.mce_banks); + kfree(vcpu->arch.mci_ctl2_banks); fail_free_pio_data: free_page((unsigned long)vcpu->arch.pio_data); fail_free_lapic: @@ -11548,6 +11599,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_hv_vcpu_uninit(vcpu); kvm_pmu_destroy(vcpu); kfree(vcpu->arch.mce_banks); + kfree(vcpu->arch.mci_ctl2_banks); kvm_free_lapic(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); -- cgit v1.2.3 From aebc3ca19063d68b76bcaaca81558d4f180c61b0 Mon Sep 17 00:00:00 2001 From: Jue Wang Date: Fri, 10 Jun 2022 10:11:33 -0700 Subject: KVM: x86: Enable CMCI capability by default and handle injected UCNA errors This patch enables MCG_CMCI_P by default in kvm_mce_cap_supported. It reuses ioctl KVM_X86_SET_MCE to implement injection of UnCorrectable No Action required (UCNA) errors, signaled via Corrected Machine Check Interrupt (CMCI). Neither of the CMCI and UCNA emulations depends on hardware. 
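From userspace, the injection path reuses the existing KVM_X86_SET_MCE ioctl; a hedged sketch of what a VMM might pass (illustrative values; the status bits follow the x86 MCA layout, with VAL bit 63, UC bit 61 and EN bit 60, and PCC/S/AR left clear so the request classifies as UCNA; a CMCI is only delivered if KVM_X86_SETUP_MCE advertised MCG_CMCI_P and the bank's CTL2 has CMCI_EN set):

  #include <stdint.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  #define MCI_STATUS_VAL (1ULL << 63)
  #define MCI_STATUS_UC  (1ULL << 61)
  #define MCI_STATUS_EN  (1ULL << 60)

  /* vcpu_fd is an already-created KVM vCPU file descriptor */
  static int inject_ucna(int vcpu_fd, uint8_t bank, uint64_t addr)
  {
          struct kvm_x86_mce mce = {
                  .status     = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN,
                  .addr       = addr,
                  .misc       = 0,
                  .mcg_status = 0,  /* no machine check exception pending */
                  .bank       = bank,
          };

          return ioctl(vcpu_fd, KVM_X86_SET_MCE, &mce);
  }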
Signed-off-by: Jue Wang Signed-off-by: Paolo Bonzini Message-Id: <20220610171134.772566-8-juew@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 1 + arch/x86/kvm/x86.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4002f6cc179d..c30115b9cb33 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8299,6 +8299,7 @@ static __init int hardware_setup(void) } kvm_caps.supported_mce_cap |= MCG_LMCE_P; + kvm_caps.supported_mce_cap |= MCG_CMCI_P; if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) return -EINVAL; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f743e4ef2009..031678eff28e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4901,6 +4901,42 @@ out: return r; } +/* + * Validate this is an UCNA (uncorrectable no action) error by checking the + * MCG_STATUS and MCi_STATUS registers: + * - none of the bits for Machine Check Exceptions are set + * - both the VAL (valid) and UC (uncorrectable) bits are set + * MCI_STATUS_PCC - Processor Context Corrupted + * MCI_STATUS_S - Signaled as a Machine Check Exception + * MCI_STATUS_AR - Software recoverable Action Required + */ +static bool is_ucna(struct kvm_x86_mce *mce) +{ + return !mce->mcg_status && + !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) && + (mce->status & MCI_STATUS_VAL) && + (mce->status & MCI_STATUS_UC); +} + +static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce, u64* banks) +{ + u64 mcg_cap = vcpu->arch.mcg_cap; + + banks[1] = mce->status; + banks[2] = mce->addr; + banks[3] = mce->misc; + vcpu->arch.mcg_status = mce->mcg_status; + + if (!(mcg_cap & MCG_CMCI_P) || + !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN)) + return 0; + + if (lapic_in_kernel(vcpu)) + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI); + + return 0; +} + static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce) { @@ -4910,6 +4946,12 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) return -EINVAL; + + banks += array_index_nospec(4 * mce->bank, 4 * bank_num); + + if (is_ucna(mce)) + return kvm_vcpu_x86_set_ucna(vcpu, mce, banks); + /* * if IA32_MCG_CTL is not all 1s, the uncorrected error * reporting is disabled @@ -4917,7 +4959,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && vcpu->arch.mcg_ctl != ~(u64)0) return 0; - banks += 4 * mce->bank; /* * if IA32_MCi_CTL is not all 1s, the uncorrected error * reporting is disabled for the bank -- cgit v1.2.3 From 4de5c54f8c806621ab055eeb240877d75d53b208 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 24 Jun 2022 11:45:45 -0400 Subject: KVM: nVMX: clean up posted interrupt descriptor try_cmpxchg Rely on try_cmpxchg64 for re-reading the PID on failure, using READ_ONCE only right before the first iteration. 
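The same idiom in stand-alone C11 (a sketch, not the kernel code): a compare-exchange that writes the observed value back into the expected-value argument on failure removes the need to re-read inside the loop.

  #include <stdatomic.h>
  #include <stdint.h>

  static void set_nv(_Atomic uint64_t *control, uint64_t nv_mask, uint64_t nv)
  {
          uint64_t old = atomic_load(control);  /* one explicit read, up front */
          uint64_t new;

          do {
                  new = (old & ~nv_mask) | nv;
                  /* on failure, "old" is refreshed with the current value */
          } while (!atomic_compare_exchange_weak(control, &old, new));
  }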
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 73f60aa480fe..1b56c5e5c9fb 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -34,7 +34,7 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } -static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) +static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) { /* * PID.ON can be set at any time by a different vCPU or by hardware, @@ -42,7 +42,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) * update must be retried with a fresh snapshot an ON change causes * the cmpxchg to fail. */ - if (!try_cmpxchg64(&pi_desc->control, &old, new)) + if (!try_cmpxchg64(&pi_desc->control, pold, new)) return -EBUSY; return 0; @@ -96,8 +96,9 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!x2apic_mode) dest = (dest << 8) & 0xFF00; + old.control = READ_ONCE(pi_desc->control); do { - old.control = new.control = READ_ONCE(pi_desc->control); + new.control = old.control; /* * Clear SN (as above) and refresh the destination APIC ID to @@ -111,7 +112,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * descriptor was modified on "put" to use the wakeup vector. */ new.nv = POSTED_INTR_VECTOR; - } while (pi_try_set_control(pi_desc, old.control, new.control)); + } while (pi_try_set_control(pi_desc, &old.control, new.control)); local_irq_restore(flags); @@ -156,12 +157,12 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); + old.control = READ_ONCE(pi_desc->control); do { - old.control = new.control = READ_ONCE(pi_desc->control); - /* set 'NV' to 'wakeup vector' */ + new.control = old.control; new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (pi_try_set_control(pi_desc, old.control, new.control)); + } while (pi_try_set_control(pi_desc, &old.control, new.control)); /* * Send a wakeup IPI to this CPU if an interrupt may have been posted -- cgit v1.2.3 From aae99a7c9ab371b2cb402eebf62d4e70258f8a6d Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 19 May 2022 05:26:53 -0500 Subject: x86/cpufeatures: Introduce x2AVIC CPUID bit Introduce a new feature bit for virtualized x2APIC (x2AVIC) in CPUID_Fn8000000A_EDX [SVM Revision and Feature Identification]. 
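The new bit lands in word 15 of the kernel's feature table, i.e. CPUID leaf 0x8000000A EDX. A user-space sketch of the same probe (AVIC is bit 13 per the existing define, x2AVIC bit 18):

  #include <stdio.h>
  #include <cpuid.h>

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx;

          if (!__get_cpuid(0x8000000A, &eax, &ebx, &ecx, &edx))
                  return 1;  /* SVM feature leaf not available */

          printf("AVIC:   %s\n", (edx & (1u << 13)) ? "yes" : "no");
          printf("x2AVIC: %s\n", (edx & (1u << 18)) ? "yes" : "no");
          return 0;
  }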
Reviewed-by: Maxim Levitsky Signed-off-by: Suravee Suthikulpanit Reviewed-by: Paolo Bonzini Message-Id: <20220519102709.24125-2-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 393f2bbb5e3a..6466a58b9cff 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -345,6 +345,7 @@ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ +#define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */ #define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ -- cgit v1.2.3 From bf348f667ed36c0799dd6d10adb7be9cdfea48c3 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 19 May 2022 05:26:54 -0500 Subject: KVM: x86: lapic: Rename [GET/SET]_APIC_DEST_FIELD to [GET/SET]_XAPIC_DEST_FIELD To signify that the macros only support 8-bit xAPIC destination ID. Suggested-by: Maxim Levitsky Reviewed-by: Maxim Levitsky Reviewed-by: Pankaj Gupta Signed-off-by: Suravee Suthikulpanit Reviewed-by: Paolo Bonzini Message-Id: <20220519102709.24125-3-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/hyperv/hv_apic.c | 2 +- arch/x86/include/asm/apicdef.h | 4 ++-- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/ipi.c | 2 +- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/svm/avic.c | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index db2d92fb44da..fb8b2c088681 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -46,7 +46,7 @@ static void hv_apic_icr_write(u32 low, u32 id) { u64 reg_val; - reg_val = SET_APIC_DEST_FIELD(id); + reg_val = SET_XAPIC_DEST_FIELD(id); reg_val = reg_val << 32; reg_val |= low; diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 92035eb3afee..68d213e83fcc 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -89,8 +89,8 @@ #define APIC_DM_EXTINT 0x00700 #define APIC_VECTOR_MASK 0x000FF #define APIC_ICR2 0x310 -#define GET_APIC_DEST_FIELD(x) (((x) >> 24) & 0xFF) -#define SET_APIC_DEST_FIELD(x) ((x) << 24) +#define GET_XAPIC_DEST_FIELD(x) (((x) >> 24) & 0xFF) +#define SET_XAPIC_DEST_FIELD(x) ((x) << 24) #define APIC_LVTT 0x320 #define APIC_LVTTHMR 0x330 #define APIC_LVTPC 0x340 diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 189d3a5e471a..a4347605ab00 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -275,7 +275,7 @@ void native_apic_icr_write(u32 low, u32 id) unsigned long flags; local_irq_save(flags); - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); + apic_write(APIC_ICR2, SET_XAPIC_DEST_FIELD(id)); apic_write(APIC_ICR, low); local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index d1fb874fbe64..2a6509e8c840 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -99,7 +99,7 @@ sendmask: static inline int __prepare_ICR2(unsigned int mask) { - return SET_APIC_DEST_FIELD(mask); + return SET_XAPIC_DEST_FIELD(mask); } static inline void __xapic_wait_icr_idle(void) diff --git a/arch/x86/kvm/lapic.c 
b/arch/x86/kvm/lapic.c index f03facc2ee3e..6f31cc0548f0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1326,7 +1326,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) if (apic_x2apic_mode(apic)) irq.dest_id = icr_high; else - irq.dest_id = GET_APIC_DEST_FIELD(icr_high); + irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high); trace_kvm_apic_ipi(icr_low, irq.dest_id); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index d1bc5820ea46..37016feb5ac4 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -303,7 +303,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source if (apic_x2apic_mode(source)) dest = icrh; else - dest = GET_APIC_DEST_FIELD(icrh); + dest = GET_XAPIC_DEST_FIELD(icrh); if (dest_mode == APIC_DEST_PHYSICAL) { /* broadcast destination, use slow path */ @@ -397,7 +397,7 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, */ kvm_for_each_vcpu(i, vcpu, kvm) { if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, - GET_APIC_DEST_FIELD(icrh), + GET_XAPIC_DEST_FIELD(icrh), icrl & APIC_DEST_MASK)) { vcpu->arch.apic->irr_pending = true; svm_complete_interrupt_delivery(vcpu, -- cgit v1.2.3 From 4bdec12aa8d6d0fd7f1ac6740c456bf726e53c0b Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 19 May 2022 05:26:55 -0500 Subject: KVM: SVM: Detect X2APIC virtualization (x2AVIC) support Add CPUID check for the x2APIC virtualization (x2AVIC) feature. If available, the SVM driver can support both AVIC and x2AVIC modes when loading the kvm_amd driver with avic=1. The operating mode will be determined at runtime depending on the guest APIC mode. Reviewed-by: Maxim Levitsky Reviewed-by: Pankaj Gupta Signed-off-by: Suravee Suthikulpanit Message-Id: <20220519102709.24125-4-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 3 +++ arch/x86/kvm/svm/avic.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 15 ++------------- arch/x86/kvm/svm/svm.h | 9 +++++++++ 4 files changed, 59 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 1b07fba11704..1d4d30cd84c2 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -195,6 +195,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define AVIC_ENABLE_SHIFT 31 #define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) +#define X2APIC_MODE_SHIFT 30 +#define X2APIC_MODE_MASK (1 << X2APIC_MODE_SHIFT) + #define LBR_CTL_ENABLE_MASK BIT_ULL(0) #define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 37016feb5ac4..509163833cbc 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -40,6 +40,9 @@ #define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) #define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) +static bool force_avic; +module_param_unsafe(force_avic, bool, 0444); + /* Note: * This hash table is used to map VM_ID to a struct kvm_svm, * when handling AMD IOMMU GALOG notification to schedule in * a particular vCPU. */ #define SVM_VM_DATA_HASH_BITS 8 static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); static u32 next_vm_id = 0; static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); +enum avic_modes avic_mode; /* * This is a wrapper of struct amd_iommu_ir_data.
@@ -1058,3 +1062,44 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) avic_vcpu_load(vcpu, vcpu->cpu); } + +/* + * Note: + * - The module param avic enables both xAPIC and x2APIC modes. + * - The hypervisor can support both xAVIC and x2AVIC in the same guest. + * - The mode can be switched at run-time. + */ +bool avic_hardware_setup(struct kvm_x86_ops *x86_ops) +{ + if (!npt_enabled) + return false; + + if (boot_cpu_has(X86_FEATURE_AVIC)) { + avic_mode = AVIC_MODE_X1; + pr_info("AVIC enabled\n"); + } else if (force_avic) { + /* + * Some older systems do not advertise AVIC support. + * See the Revision Guide for the specific AMD processor for more detail. + */ + avic_mode = AVIC_MODE_X1; + pr_warn("AVIC is not supported in CPUID but force enabled"); + pr_warn("Your system might crash and burn"); + } + + /* AVIC is a prerequisite for x2AVIC. */ + if (boot_cpu_has(X86_FEATURE_X2AVIC)) { + if (avic_mode == AVIC_MODE_X1) { + avic_mode = AVIC_MODE_X2; + pr_info("x2AVIC enabled\n"); + } else { + pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled"); + pr_warn(FW_BUG "Try enable AVIC using force_avic option"); + } + } + + if (avic_mode != AVIC_MODE_NONE) + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + + return !!avic_mode; +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 136298cfb3fb..e45568aea060 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -188,9 +188,6 @@ module_param(tsc_scaling, int, 0444); static bool avic; module_param(avic, bool, 0444); -static bool force_avic; -module_param_unsafe(force_avic, bool, 0444); - bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -5016,17 +5013,9 @@ static __init int svm_hardware_setup(void) nrips = false; } - enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic); + enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops); - if (enable_apicv) { - if (!boot_cpu_has(X86_FEATURE_AVIC)) { - pr_warn("AVIC is not supported in CPUID but force enabled"); - pr_warn("Your system might crash and burn"); - } else - pr_info("AVIC enabled\n"); - - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); - } else { + if (!enable_apicv) { svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_unblocking = NULL; svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index d51de3c9264a..6d6390f785c5 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -36,6 +36,14 @@ extern bool npt_enabled; extern int vgif; extern bool intercept_smi; +enum avic_modes { + AVIC_MODE_NONE = 0, + AVIC_MODE_X1, + AVIC_MODE_X2, +}; + +extern enum avic_modes avic_mode; + /* * Clean bits in VMCB. * VMCB_ALL_CLEAN_MASK might also need to @@ -607,6 +615,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ +bool avic_hardware_setup(struct kvm_x86_ops *ops); int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); -- cgit v1.2.3 From d2fe6bf5b881205ae99e858ed9b7ba1ebdb9c029 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 19 May 2022 05:26:56 -0500 Subject: KVM: SVM: Update max number of vCPUs supported for x2AVIC mode xAVIC and x2AVIC modes can support different numbers of vCPUs. Update the existing logic to support each mode accordingly. Also, modify the maximum physical APIC ID for AVIC to 255 to reflect the actual value supported by the architecture.
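As a sanity check of the new bounds, a small stand-alone sketch (illustrative C; note that 0xff is the xAPIC broadcast ID, which is why the largest usable table index in AVIC mode is 0xfe, i.e. 254, while x2AVIC allows up to 0x1ff, i.e. 511):

  #include <stdbool.h>
  #include <stdint.h>

  enum avic_modes { AVIC_MODE_NONE, AVIC_MODE_X1, AVIC_MODE_X2 };

  #define AVIC_MAX_PHYSICAL_ID   0xFEULL
  #define X2AVIC_MAX_PHYSICAL_ID 0x1FFULL

  static bool avic_index_valid(enum avic_modes mode, uint32_t index)
  {
          if (mode == AVIC_MODE_X1)
                  return index <= AVIC_MAX_PHYSICAL_ID;
          if (mode == AVIC_MODE_X2)
                  return index <= X2AVIC_MAX_PHYSICAL_ID;
          return false;
  }

This mirrors the index > AVIC_MAX_PHYSICAL_ID and index > X2AVIC_MAX_PHYSICAL_ID checks in the diff below.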
Reviewed-by: Maxim Levitsky Reviewed-by: Pankaj Gupta Signed-off-by: Suravee Suthikulpanit Message-Id: <20220519102709.24125-5-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 12 +++++++++--- arch/x86/kvm/svm/avic.c | 8 +++++--- 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 1d4d30cd84c2..ed4b83d7f09e 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -258,10 +258,16 @@ enum avic_ipi_failure_cause { /* - * 0xff is broadcast, so the max index allowed for physical APIC ID - * table is 0xfe. APIC IDs above 0xff are reserved. + * For AVIC, the max index allowed for physical APIC ID + * table is 0xff (255). */ -#define AVIC_MAX_PHYSICAL_ID_COUNT 0xff +#define AVIC_MAX_PHYSICAL_ID 0XFEULL + +/* + * For x2AVIC, the max index allowed for physical APIC ID + * table is 0x1ff (511). + */ +#define X2AVIC_MAX_PHYSICAL_ID 0x1FFUL #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 509163833cbc..15dc6ca2c498 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -179,7 +179,7 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; - vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; + vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; if (kvm_apicv_activated(svm->vcpu.kvm)) @@ -194,7 +194,8 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, u64 *avic_physical_id_table; struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - if (index >= AVIC_MAX_PHYSICAL_ID_COUNT) + if ((avic_mode == AVIC_MODE_X1 && index > AVIC_MAX_PHYSICAL_ID) || + (avic_mode == AVIC_MODE_X2 && index > X2AVIC_MAX_PHYSICAL_ID)) return NULL; avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); @@ -241,7 +242,8 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) int id = vcpu->vcpu_id; struct vcpu_svm *svm = to_svm(vcpu); - if (id >= AVIC_MAX_PHYSICAL_ID_COUNT) + if ((avic_mode == AVIC_MODE_X1 && id > AVIC_MAX_PHYSICAL_ID) || + (avic_mode == AVIC_MODE_X2 && id > X2AVIC_MAX_PHYSICAL_ID)) return -EINVAL; if (!vcpu->arch.apic->regs) -- cgit v1.2.3 From c514d3a348ac4358215fdfb9ed17d4c01dd39731 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 19 May 2022 05:26:57 -0500 Subject: KVM: SVM: Update avic_kick_target_vcpus to support 32-bit APIC ID In x2APIC mode, ICRH contains 32-bit destination APIC ID. So, update the avic_kick_target_vcpus() accordingly. Reviewed-by: Maxim Levitsky Signed-off-by: Suravee Suthikulpanit Message-Id: <20220519102709.24125-6-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 15dc6ca2c498..8b8a221cd635 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -402,9 +402,15 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, * since entered the guest will have processed pending IRQs at VMRUN. 
*/ kvm_for_each_vcpu(i, vcpu, kvm) { + u32 dest; + + if (apic_x2apic_mode(vcpu->arch.apic)) + dest = icrh; + else + dest = GET_XAPIC_DEST_FIELD(icrh); + if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, - GET_XAPIC_DEST_FIELD(icrh), - icrl & APIC_DEST_MASK)) { + dest, icrl & APIC_DEST_MASK)) { vcpu->arch.apic->irr_pending = true; svm_complete_interrupt_delivery(vcpu, icrl & APIC_MODE_MASK, -- cgit v1.2.3 From ab1b1dc131cd7da4c66758e711c0f69da3633561 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 19 May 2022 05:26:58 -0500 Subject: KVM: SVM: Do not support updating APIC ID when in x2APIC mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In X2APIC mode, the Logical Destination Register is read-only, which provides a fixed mapping between the logical and physical APIC IDs. Therefore, there is no Logical APIC ID table in X2AVIC and the processor uses the X2APIC ID in the backing page to create a vCPU’s logical ID. In addition, KVM does not support updating APIC ID in x2APIC mode, which means AVIC does not need to handle this case. Therefore, check x2APIC mode when handling physical and logical APIC ID update, and when invalidating logical APIC ID table. Reviewed-by: Maxim Levitsky Suggested-by: Maxim Levitsky Signed-off-by: Suravee Suthikulpanit Message-Id: <20220519102709.24125-7-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 8b8a221cd635..cc4da232040c 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -526,8 +526,13 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); bool flat = svm->dfr_reg == APIC_DFR_FLAT; - u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); + u32 *entry; + /* Note: x2AVIC does not use logical APIC ID table */ + if (apic_x2apic_mode(vcpu->arch.apic)) + return; + + entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); if (entry) clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); } @@ -539,6 +544,10 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); u32 id = kvm_xapic_id(vcpu->arch.apic); + /* AVIC does not support LDR update for x2APIC */ + if (apic_x2apic_mode(vcpu->arch.apic)) + return 0; + if (ldr == svm->ldr_reg) return 0; -- cgit v1.2.3 From 5c127c85472c0c4c9d1f88e2807adcab9335d97c Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 19 May 2022 05:26:59 -0500 Subject: KVM: SVM: Adding support for configuring x2APIC MSRs interception When enabling x2APIC virtualization (x2AVIC), the interception of x2APIC MSRs must be disabled to let the hardware virtualize guest MSR accesses. The current implementation keeps track of the MSR interception state in the svm_direct_access_msrs array. Therefore, extend the array to include the x2APIC MSRs.
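For reference, the x2APIC register block occupies MSRs 0x800-0x8ff; architecturally, each 16-byte xAPIC MMIO register maps to the MSR at 0x800 plus the register offset divided by 16. A stand-alone sketch of that mapping (illustrative; APIC_BASE_MSR is 0x800 as in KVM's lapic.h):

  #include <stdio.h>
  #include <stdint.h>

  #define APIC_BASE_MSR 0x800

  static uint32_t x2apic_msr(uint32_t mmio_offset)
  {
          return APIC_BASE_MSR + (mmio_offset >> 4);
  }

  int main(void)
  {
          printf("APIC_ID  (offset 0x20)  -> MSR 0x%x\n", x2apic_msr(0x20));  /* 0x802 */
          printf("APIC_ICR (offset 0x300) -> MSR 0x%x\n", x2apic_msr(0x300)); /* 0x830 */
          return 0;
  }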
Reviewed-by: Maxim Levitsky Signed-off-by: Suravee Suthikulpanit Message-Id: <20220519102709.24125-8-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 25 +++++++++++++++++++++++++ arch/x86/kvm/svm/svm.h | 4 ++-- 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e45568aea060..382edac728c7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -100,6 +100,31 @@ static const struct svm_direct_access_msrs { { .index = MSR_IA32_CR_PAT, .always = false }, { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, { .index = MSR_TSC_AUX, .always = false }, + { .index = (APIC_BASE_MSR + APIC_ID), .always = false }, + { .index = (APIC_BASE_MSR + APIC_LVR), .always = false }, + { .index = (APIC_BASE_MSR + APIC_TASKPRI), .always = false }, + { .index = (APIC_BASE_MSR + APIC_ARBPRI), .always = false }, + { .index = (APIC_BASE_MSR + APIC_PROCPRI), .always = false }, + { .index = (APIC_BASE_MSR + APIC_EOI), .always = false }, + { .index = (APIC_BASE_MSR + APIC_RRR), .always = false }, + { .index = (APIC_BASE_MSR + APIC_LDR), .always = false }, + { .index = (APIC_BASE_MSR + APIC_DFR), .always = false }, + { .index = (APIC_BASE_MSR + APIC_SPIV), .always = false }, + { .index = (APIC_BASE_MSR + APIC_ISR), .always = false }, + { .index = (APIC_BASE_MSR + APIC_TMR), .always = false }, + { .index = (APIC_BASE_MSR + APIC_IRR), .always = false }, + { .index = (APIC_BASE_MSR + APIC_ESR), .always = false }, + { .index = (APIC_BASE_MSR + APIC_ICR), .always = false }, + { .index = (APIC_BASE_MSR + APIC_ICR2), .always = false }, + { .index = (APIC_BASE_MSR + APIC_LVTT), .always = false }, + { .index = (APIC_BASE_MSR + APIC_LVTTHMR), .always = false }, + { .index = (APIC_BASE_MSR + APIC_LVTPC), .always = false }, + { .index = (APIC_BASE_MSR + APIC_LVT0), .always = false }, + { .index = (APIC_BASE_MSR + APIC_LVT1), .always = false }, + { .index = (APIC_BASE_MSR + APIC_LVTERR), .always = false }, + { .index = (APIC_BASE_MSR + APIC_TMICT), .always = false }, + { .index = (APIC_BASE_MSR + APIC_TMCCT), .always = false }, + { .index = (APIC_BASE_MSR + APIC_TDCR), .always = false }, { .index = MSR_INVALID, .always = false }, }; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 6d6390f785c5..172c2a37e60e 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -29,8 +29,8 @@ #define IOPM_SIZE PAGE_SIZE * 3 #define MSRPM_SIZE PAGE_SIZE * 2 -#define MAX_DIRECT_ACCESS_MSRS 21 -#define MSRPM_OFFSETS 16 +#define MAX_DIRECT_ACCESS_MSRS 46 +#define MSRPM_OFFSETS 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; extern int vgif; -- cgit v1.2.3 From 8fc9c7a3079e2d2e72dfb2f4e9bb2b1fc67f0ee6 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 19 May 2022 05:27:00 -0500 Subject: KVM: x86: Deactivate APICv on vCPU with APIC disabled APICv should be deactivated on vCPU that has APIC disabled. 
Therefore, call kvm_vcpu_update_apicv() when changing APIC mode, and add an additional check for APIC disabled mode when determining APICv activation. Suggested-by: Maxim Levitsky Reviewed-by: Maxim Levitsky Signed-off-by: Suravee Suthikulpanit Message-Id: <20220519102709.24125-9-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 +++- arch/x86/kvm/x86.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6f31cc0548f0..6ff17d5a2ae3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2371,8 +2371,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE)) kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); - if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) + if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) { + kvm_vcpu_update_apicv(vcpu); static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); + } apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 031678eff28e..47981e97d686 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10065,7 +10065,9 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) down_read(&vcpu->kvm->arch.apicv_update_lock); preempt_disable(); - activate = kvm_vcpu_apicv_activated(vcpu); + /* Do not activate APICV when APIC is disabled */ + activate = kvm_vcpu_apicv_activated(vcpu) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED); if (apic->apicv_active == activate) goto out; -- cgit v1.2.3 From 05c4fe8c1bd94c6f59a8b82cbd535b1aea55945f Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 19 May 2022 05:27:01 -0500 Subject: KVM: SVM: Refresh AVIC configuration when changing APIC mode AMD AVIC can support xAPIC and x2APIC virtualization, which requires changing the x2APIC bit in the VMCB and the MSR interception for x2APIC MSRs. Therefore, call avic_refresh_apicv_exec_ctrl() to refresh configuration accordingly.
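The APIC modes involved here fall out of just two bits in the APIC base MSR; a stand-alone sketch of the decode (values mirror KVM's enum lapic_mode; global APIC enable is bit 11, x2APIC enable is bit 10):

  #include <stdint.h>

  #define MSR_IA32_APICBASE_ENABLE (1u << 11)
  #define X2APIC_ENABLE            (1u << 10)

  enum lapic_mode {
          LAPIC_MODE_DISABLED = 0,
          LAPIC_MODE_INVALID  = X2APIC_ENABLE, /* x2APIC without global enable */
          LAPIC_MODE_XAPIC    = MSR_IA32_APICBASE_ENABLE,
          LAPIC_MODE_X2APIC   = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE,
  };

  static enum lapic_mode lapic_mode_of(uint64_t apic_base)
  {
          return (enum lapic_mode)(apic_base &
                  (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE));
  }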
Reviewed-by: Maxim Levitsky Signed-off-by: Suravee Suthikulpanit Message-Id: <20220519102709.24125-10-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 12 ++++++++++++ arch/x86/kvm/svm/svm.c | 1 + arch/x86/kvm/svm/svm.h | 2 ++ 3 files changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index cc4da232040c..eb7060d3327b 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -675,6 +675,18 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } +void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +{ + if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE) + return; + + if (kvm_get_apic_mode(vcpu) == LAPIC_MODE_INVALID) { + WARN_ONCE(true, "Invalid local APIC state (vcpu_id=%d)", vcpu->vcpu_id); + return; + } + avic_refresh_apicv_exec_ctrl(vcpu); +} + static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) { int ret = 0; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 382edac728c7..67ba0d3a7f62 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4793,6 +4793,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_nmi_window = svm_enable_nmi_window, .enable_irq_window = svm_enable_irq_window, .update_cr8_intercept = svm_update_cr8_intercept, + .set_virtual_apic_mode = avic_set_virtual_apic_mode, .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, .apicv_post_state_restore = avic_apicv_post_state_restore, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 172c2a37e60e..6d89842853cf 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -634,6 +634,8 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_ring_doorbell(struct kvm_vcpu *vcpu); unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu); +void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); + /* sev.c */ -- cgit v1.2.3 From 7a8f7c1f3434afc94b6895eff713e0fcea01e3b4 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 19 May 2022 05:27:02 -0500 Subject: KVM: x86: nSVM: always intercept x2apic msrs As a preparation for x2avic, this patch ensures that x2apic msrs are always intercepted for the nested guest. 
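The offset check added below leans on the MSR-permission-map encoding: two intercept bits per MSR, so four MSRs per byte and sixteen per 32-bit word, which is where the offset-times-16 conversion comes from. A stand-alone sketch of the same math (illustrative C):

  #include <stdbool.h>
  #include <stdint.h>

  #define APIC_BASE_MSR 0x800

  static bool is_x2apic_msrpm_offset(uint32_t offset)
  {
          /* first MSR covered by the u32 word at this offset */
          uint32_t msr = offset * 16;

          return msr >= APIC_BASE_MSR && msr < APIC_BASE_MSR + 0x100;
  }

  /* e.g. offset 0x80 covers MSRs 0x800..0x80f, the start of the x2APIC range */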
Reviewed-by: Suravee Suthikulpanit Tested-by: Suravee Suthikulpanit Signed-off-by: Maxim Levitsky Message-Id: <20220519102709.24125-11-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 5 +++++ arch/x86/kvm/svm/svm.h | 9 +++++++++ 2 files changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 83bae1f2eeb8..adf4120b05d9 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -230,6 +230,11 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) break; p = msrpm_offsets[i]; + + /* x2apic msrs are intercepted always for the nested guest */ + if (is_x2apic_msrpm_offset(p)) + continue; + offset = svm->nested.ctl.msrpm_base_pa + (p * 4); if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 6d89842853cf..288a1384aa0f 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -521,6 +521,15 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; } +static inline bool is_x2apic_msrpm_offset(u32 offset) +{ + /* 4 msrs per u8, and 4 u8 in u32 */ + u32 msr = offset * 16; + + return (msr >= APIC_BASE_MSR) && + (msr < (APIC_BASE_MSR + 0x100)); +} + /* svm.c */ #define MSR_INVALID 0xffffffffU -- cgit v1.2.3 From 4d1d7942e36add0aa741a62d0c8e3aba2d5b3ab1 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 19 May 2022 05:27:03 -0500 Subject: KVM: SVM: Introduce logic to (de)activate x2AVIC mode Introduce logic to (de)activate AVIC, which also allows switching between AVIC and x2AVIC modes at runtime. When an AVIC-enabled guest switches from APIC to x2APIC mode, the SVM driver needs to perform the following steps: 1. Set the x2APIC mode bit for AVIC in the VMCB along with the maximum APIC ID support for each mode accordingly. 2. Disable x2APIC MSR interception in order to allow the hardware to virtualize x2APIC MSR accesses.
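Step 1 boils down to two bits in the VMCB's int_ctl; a minimal sketch (constants as defined earlier in svm.h: AVIC enable is bit 31, x2APIC mode is bit 30):

  #include <stdint.h>

  #define AVIC_ENABLE_MASK (1u << 31)
  #define X2APIC_MODE_MASK (1u << 30)

  static uint32_t avic_int_ctl(uint32_t int_ctl, int enable, int x2apic)
  {
          /* always start from a clean slate, as the patch does */
          int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
          if (enable) {
                  int_ctl |= AVIC_ENABLE_MASK;
                  if (x2apic)
                          int_ctl |= X2APIC_MODE_MASK;
          }
          return int_ctl;
  }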
Reported-by: kernel test robot Reviewed-by: Maxim Levitsky Signed-off-by: Suravee Suthikulpanit Message-Id: <20220519102709.24125-12-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 1 + arch/x86/kvm/svm/avic.c | 39 ++++++++++++++++++++++++++++++++++----- arch/x86/kvm/svm/svm.c | 18 ++++++++++++++++++ arch/x86/kvm/svm/svm.h | 1 + 4 files changed, 54 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index ed4b83d7f09e..0361626841bc 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -256,6 +256,7 @@ enum avic_ipi_failure_cause { AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, }; +#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(9, 0) /* * For AVIC, the max index allowed for physical APIC ID diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index eb7060d3327b..c7db96428b19 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -63,6 +63,36 @@ struct amd_svm_iommu_ir { void *data; /* Storing pointer to struct amd_ir_data */ }; +static void avic_activate_vmcb(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); + vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; + + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + if (apic_x2apic_mode(svm->vcpu.arch.apic)) { + vmcb->control.int_ctl |= X2APIC_MODE_MASK; + vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID; + /* Disabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, false); + } else { + vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; + /* Enabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, true); + } +} + +static void avic_deactivate_vmcb(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); + vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; + + /* Enabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, true); +} /* Note: * This function is called from IOMMU driver to notify @@ -179,13 +209,12 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; - vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; if (kvm_apicv_activated(svm->vcpu.kvm)) - vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + avic_activate_vmcb(svm); else - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + avic_deactivate_vmcb(svm); } static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, @@ -1049,9 +1078,9 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) * accordingly before re-activating. 
*/ avic_apicv_post_state_restore(vcpu); - vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + avic_activate_vmcb(svm); } else { - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + avic_deactivate_vmcb(svm); } vmcb_mark_dirty(vmcb, VMCB_AVIC); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 67ba0d3a7f62..491e4d549e2f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -805,6 +805,24 @@ void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) } } +void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) +{ + int i; + + if (avic_mode != AVIC_MODE_X2 || + !apic_x2apic_mode(svm->vcpu.arch.apic)) + return; + + for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) { + int index = direct_access_msrs[i].index; + + if ((index < APIC_BASE_MSR) || + (index > APIC_BASE_MSR + 0xff)) + continue; + set_msr_interception(&svm->vcpu, svm->msrpm, index, + !intercept, !intercept); + } +} void svm_vcpu_free_msrpm(u32 *msrpm) { diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 288a1384aa0f..ccaae7d160cd 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -555,6 +555,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value); int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code); void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write); +void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable); void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, int trig_mode, int vec); -- cgit v1.2.3 From c0caeee65af3944b7b8abbf566e7cc1fae15c775 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 19 May 2022 05:27:04 -0500 Subject: KVM: SVM: Do not throw warning when calling avic_vcpu_load on a running vcpu Originally, this WARN_ON was designed to detect when calling avic_vcpu_load() on an already running vcpu in AVIC mode (i.e. the AVIC is_running bit is set). However, for x2AVIC, the vCPU can switch from xAPIC to x2APIC mode while in running state, in which case avic_vcpu_load() will be called from svm_refresh_apicv_exec_ctrl(). Therefore, remove this warning since it is no longer appropriate. Reviewed-by: Maxim Levitsky Reviewed-by: Pankaj Gupta Signed-off-by: Suravee Suthikulpanit Message-Id: <20220519102709.24125-13-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index c7db96428b19..fcff249e73cd 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1030,7 +1030,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) return; entry = READ_ONCE(*(svm->avic_physical_id_cache)); - WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); -- cgit v1.2.3 From 0e311d33bfbef86da130674e8528cc23e6acfe16 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 19 May 2022 05:27:05 -0500 Subject: KVM: SVM: Introduce hybrid-AVIC mode Currently, AVIC is inhibited when booting a VM w/ x2APIC support, because AVIC cannot virtualize x2APIC MSR register accesses. However, the AVIC doorbell can be used to accelerate interrupt injection into a running vCPU, while all guest accesses to x2APIC MSRs will be intercepted and emulated by KVM. With hybrid-AVIC support, the APICV_INHIBIT_REASON_X2APIC is no longer enforced.
Suggested-by: Maxim Levitsky
Reviewed-by: Maxim Levitsky
Signed-off-by: Suravee Suthikulpanit
Message-Id: <20220519102709.24125-14-suravee.suthikulpanit@amd.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/include/asm/kvm_host.h |  5 -----
 arch/x86/kvm/svm/avic.c         | 13 +++++++++++--
 arch/x86/kvm/svm/svm.c          |  9 ---------
 3 files changed, 11 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 88a3026ee163..de5a149d0971 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1129,11 +1129,6 @@ enum kvm_apicv_inhibit {
 	 */
 	APICV_INHIBIT_REASON_PIT_REINJ,

-	/*
-	 * AVIC is inhibited because the guest has x2apic in its CPUID.
-	 */
-	APICV_INHIBIT_REASON_X2APIC,
-
 	/*
 	 * AVIC is disabled because SEV doesn't support it.
 	 */
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index fcff249e73cd..a8db5d2532bc 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -71,12 +71,22 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
 	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;

 	vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
-	if (apic_x2apic_mode(svm->vcpu.arch.apic)) {
+
+	/* Note:
+	 * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC
+	 * MSR accesses, while interrupt injection to a running vCPU
+	 * can be achieved using the AVIC doorbell. The AVIC hardware
+	 * still accelerates MMIO accesses, but this does no harm as the
+	 * guest is not supposed to access xAPIC MMIO when it uses x2APIC.
+	 */
+	if (apic_x2apic_mode(svm->vcpu.arch.apic) &&
+	    avic_mode == AVIC_MODE_X2) {
 		vmcb->control.int_ctl |= X2APIC_MODE_MASK;
 		vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
 		/* Disabling MSR intercept for x2APIC registers */
 		svm_set_x2apic_msr_interception(svm, false);
 	} else {
+		/* For xAVIC and hybrid-xAVIC modes */
 		vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
 		/* Enabling MSR intercept for x2APIC registers */
 		svm_set_x2apic_msr_interception(svm, true);
@@ -968,7 +978,6 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
 			  BIT(APICV_INHIBIT_REASON_NESTED) |
 			  BIT(APICV_INHIBIT_REASON_IRQWIN) |
 			  BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
-			  BIT(APICV_INHIBIT_REASON_X2APIC) |
 			  BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
 			  BIT(APICV_INHIBIT_REASON_SEV) |
 			  BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 491e4d549e2f..bb0457c1e41c 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4160,7 +4160,6 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct kvm_cpuid_entry2 *best;
-	struct kvm *kvm = vcpu->kvm;

 	vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
 				    boot_cpu_has(X86_FEATURE_XSAVE) &&
@@ -4192,14 +4191,6 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 		vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
 	}

-	if (kvm_vcpu_apicv_active(vcpu)) {
-		/*
-		 * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
-		 * is exposed to the guest, disable AVIC.
-		 */
-		if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
-			kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC);
-	}
 	init_vmcb_after_set_cpuid(vcpu);
 }
--
cgit v1.2.3

From f8d8ac215919cf40d3b358846ed8e8e3a3683131 Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit
Date: Thu, 19 May 2022 05:27:06 -0500
Subject: KVM: x86: Warn about APICv inconsistency only when vCPU APIC mode is valid

When launching a VM with x2APIC and more than 255 vCPUs, the guest
kernel can disable x2APIC (e.g. via the nox2apic kernel option). The VM
then falls back to xAPIC mode and disables vCPUs with IDs of 255 and
greater. In this case, APICv is deactivated for the disabled vCPUs.
However, the current APICv consistency check does not account for this
case and fires a spurious warning. Therefore, modify the warning logic
to report the inconsistency only when the vCPU's APIC mode is valid.

Reviewed-by: Maxim Levitsky
Signed-off-by: Suravee Suthikulpanit
Message-Id: <20220519102709.24125-15-suravee.suthikulpanit@amd.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 47981e97d686..acd7ce1dcb9f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10477,7 +10477,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		 * per-VM state, and responding vCPUs must wait for the update
 		 * to complete before servicing KVM_REQ_APICV_UPDATE.
 		 */
-		WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu));
+		WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
+			     (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));

 		exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
 		if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
--
cgit v1.2.3

From 8c9e639da435874fb845c4d296ce55664071ea7a Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit
Date: Thu, 19 May 2022 05:27:07 -0500
Subject: KVM: SVM: Use target APIC ID to complete x2AVIC IRQs when possible

For x2AVIC, the index from incomplete-IPI #vmexit info is invalid for
logical cluster mode. Only the ICRH/ICRL values can be used to
determine the IPI destination APIC ID.

Since QEMU defines the guest physical APIC ID to be the same as the
vCPU ID, it can be used to quickly identify the target vCPU for the
IPI, avoiding the overhead of searching through all vCPUs to find the
target.

Reviewed-by: Maxim Levitsky
Signed-off-by: Suravee Suthikulpanit
Message-Id: <20220519102709.24125-16-suravee.suthikulpanit@amd.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/svm/avic.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index a8db5d2532bc..229de5fe665d 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -390,9 +390,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source

 			logid_index = cluster + __ffs(bitmap);

-			if (apic_x2apic_mode(source)) {
-				l1_physical_id = logid_index;
-			} else {
+			if (!apic_x2apic_mode(source)) {
 				u32 *avic_logical_id_table =
 					page_address(kvm_svm->avic_logical_id_table_page);

@@ -407,6 +405,23 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source

 				l1_physical_id = logid_entry &
 						 AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+			} else {
+				/*
+				 * For x2APIC logical mode, cannot leverage the index.
+				 * Instead, calculate physical ID from logical ID in ICRH.
+				 */
+				int cluster = (icrh & 0xffff0000) >> 16;
+				int apic = ffs(icrh & 0xffff) - 1;
+
+				/*
+				 * If the x2APIC logical ID sub-field (i.e. icrh[15:0])
+				 * contains anything but a single bit, we cannot use the
+				 * fast path, because it is limited to a single vCPU.
+				 */
+				if (apic < 0 || icrh != (1 << apic))
+					return -EINVAL;
+
+				l1_physical_id = (cluster << 4) + apic;
 			}
 		}
--
cgit v1.2.3

From 39b6b8c35cf37970e07e9081c0e7d1083930b2f7 Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit
Date: Thu, 19 May 2022 05:27:08 -0500
Subject: KVM: SVM: Add AVIC doorbell tracepoint

Add a tracepoint to track the number of doorbells sent to signal a
running vCPU to process an IRQ after it has been injected.

Reviewed-by: Maxim Levitsky
Signed-off-by: Suravee Suthikulpanit
Message-Id: <20220519102709.24125-17-suravee.suthikulpanit@amd.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/svm/avic.c |  4 +++-
 arch/x86/kvm/trace.h    | 18 ++++++++++++++++++
 arch/x86/kvm/x86.c      |  1 +
 3 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 229de5fe665d..f18c852b6900 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -324,8 +324,10 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu)
 	 */
 	int cpu = READ_ONCE(vcpu->cpu);

-	if (cpu != get_cpu())
+	if (cpu != get_cpu()) {
 		wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
+		trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
+	}
 	put_cpu();
 }

diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index fd28dd40b813..bc85622e28b2 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1490,6 +1490,24 @@ TRACE_EVENT(kvm_avic_kick_vcpu_slowpath,
 		  __entry->icrh, __entry->icrl, __entry->index)
 );

+TRACE_EVENT(kvm_avic_doorbell,
+	    TP_PROTO(u32 vcpuid, u32 apicid),
+	    TP_ARGS(vcpuid, apicid),
+
+	TP_STRUCT__entry(
+		__field(u32, vcpuid)
+		__field(u32, apicid)
+	),
+
+	TP_fast_assign(
+		__entry->vcpuid = vcpuid;
+		__entry->apicid = apicid;
+	),
+
+	TP_printk("vcpuid=%u, apicid=%u",
+		  __entry->vcpuid, __entry->apicid)
+);
+
 TRACE_EVENT(kvm_hv_timer_state,
 		TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
 		TP_ARGS(vcpu_id, hv_timer_in_use),
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index acd7ce1dcb9f..d88600e41ff8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13328,6 +13328,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_doorbell);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
--
cgit v1.2.3

From 091abbf578f926e763adc0f577baeb7f405b4bdc Mon Sep 17 00:00:00 2001
From: Maxim Levitsky
Date: Thu, 19 May 2022 05:27:09 -0500
Subject: KVM: x86: nSVM: optimize svm_set_x2apic_msr_interception

- Avoid toggling the x2apic MSR interception when it is already up to
  date.

- Avoid touching the L0 MSR bitmap when AVIC is inhibited on entry to
  guest mode, because in this case the guest usually uses its own MSR
  bitmap. Later, on VM exit, the first optimization will allow KVM to
  skip touching the L0 MSR bitmap as well.
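The first optimization is the classic "skip the update when the cached
state already matches" pattern. A minimal, self-contained model with
hypothetical names (illustrative only, not the kernel code):

#include <stdbool.h>
#include <stdio.h>

struct msr_bitmap_state {
	bool intercepted;	/* cached: what the bitmap currently encodes */
};

static void set_interception(struct msr_bitmap_state *s, bool intercept)
{
	if (intercept == s->intercepted)
		return;		/* bitmap already up to date; skip the walk */

	/* ... otherwise walk and rewrite the MSR permission bitmap ... */
	printf("rewriting bitmap: intercept=%d\n", intercept);
	s->intercepted = intercept;
}

int main(void)
{
	struct msr_bitmap_state s = { .intercepted = true };

	set_interception(&s, true);	/* no-op: state unchanged */
	set_interception(&s, false);	/* performs the rewrite */
	return 0;
}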
Reviewed-by: Suravee Suthikulpanit Tested-by: Suravee Suthikulpanit Signed-off-by: Maxim Levitsky Message-Id: <20220519102709.24125-18-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 8 ++++++++ arch/x86/kvm/svm/svm.c | 7 +++++++ arch/x86/kvm/svm/svm.h | 2 ++ 3 files changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index f18c852b6900..6919dee69f18 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -100,6 +100,14 @@ static void avic_deactivate_vmcb(struct vcpu_svm *svm) vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; + /* + * If running nested and the guest uses its own MSR bitmap, there + * is no need to update L0's msr bitmap + */ + if (is_guest_mode(&svm->vcpu) && + vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)) + return; + /* Enabling MSR intercept for x2APIC registers */ svm_set_x2apic_msr_interception(svm, true); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bb0457c1e41c..37ce061dfc76 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -809,6 +809,9 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) { int i; + if (intercept == svm->x2avic_msrs_intercepted) + return; + if (avic_mode != AVIC_MODE_X2 || !apic_x2apic_mode(svm->vcpu.arch.apic)) return; @@ -822,6 +825,8 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) set_msr_interception(&svm->vcpu, svm->msrpm, index, !intercept, !intercept); } + + svm->x2avic_msrs_intercepted = intercept; } void svm_vcpu_free_msrpm(u32 *msrpm) @@ -1393,6 +1398,8 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) goto error_free_vmsa_page; } + svm->x2avic_msrs_intercepted = true; + svm->vmcb01.ptr = page_address(vmcb01_page); svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); svm_switch_vmcb(svm, &svm->vmcb01); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index ccaae7d160cd..558ca1296d36 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -276,6 +276,8 @@ struct vcpu_svm { struct vcpu_sev_es_state sev_es; bool guest_state_loaded; + + bool x2avic_msrs_intercepted; }; struct svm_cpu_data { -- cgit v1.2.3 From 7a6177d6f344941410faa51f3e46ef242c54b406 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 15 Jun 2022 13:32:29 -0400 Subject: KVM: x86: complete fast IN directly with complete_emulator_pio_in() Use complete_emulator_pio_in() directly when completing fast PIO, there's no need to bounce through emulator_pio_in(): the comment about ECX changing doesn't apply to fast PIO, which isn't used for string I/O. No functional change intended. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d88600e41ff8..d66a873f4427 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8871,11 +8871,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) /* For size less than 4 we merge, else we zero extend */ val = (vcpu->arch.pio.size < 4) ? 
kvm_rax_read(vcpu) : 0; - /* - * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform - * the copy and tracing - */ - emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1); + complete_emulator_pio_in(vcpu, &val); kvm_rax_write(vcpu, val); return kvm_skip_emulated_instruction(vcpu); -- cgit v1.2.3 From 0f87ac234d98c68e4b2bfde6a8bcdeef3d3f54b7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 22 Oct 2021 06:50:06 -0400 Subject: KVM: x86: inline kernel_pio into its sole caller The caller of kernel_pio already has arguments for most of what kernel_pio fishes out of vcpu->arch.pio. This is the first step towards ensuring that vcpu->arch.pio.* is only used when exiting to userspace. No functional change intended. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d66a873f4427..524a96d26399 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7575,37 +7575,31 @@ emul_write: return emulator_write_emulated(ctxt, addr, new, bytes, exception); } -static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) -{ - int r = 0, i; - - for (i = 0; i < vcpu->arch.pio.count; i++) { - if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); - else - r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, - pd); - if (r) - break; - pd += vcpu->arch.pio.size; - } - return r; -} - static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, unsigned short port, unsigned int count, bool in) { + void *data = vcpu->arch.pio_data; + unsigned i; + int r; + vcpu->arch.pio.port = port; vcpu->arch.pio.in = in; - vcpu->arch.pio.count = count; + vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) - return 1; + for (i = 0; i < count; i++) { + if (in) + r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data); + else + r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data); + if (r) + goto userspace_io; + data += size; + } + return 1; +userspace_io: vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = size; -- cgit v1.2.3 From 35ab3b77a0ae76246dd2666d4f8190c824392fb4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 15 Jun 2022 11:05:06 -0400 Subject: KVM: x86: drop PIO from unregistered devices KVM protects the device list with SRCU, and therefore different calls to kvm_io_bus_read()/kvm_io_bus_write() can very well see different incarnations of kvm->buses. If userspace unregisters a device while vCPUs are running there is no well-defined result. This patch applies a safe fallback by returning early from emulator_pio_in_out(). This corresponds to returning zeroes from IN, and dropping the writes on the floor for OUT. 
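A self-contained model of that fallback, with bus_read() standing in
for the in-kernel device lookup (illustrative only, not the kernel
code): reads that fail mid-transfer complete as zeroes, so the guest
sees deterministic data rather than a partial, undefined result.

#include <stdio.h>
#include <string.h>

/* Stand-in device read: fails once the "device" has been unregistered. */
static int device_present = 1;
static int bus_read(unsigned short port, void *buf, int size)
{
	(void)port;
	if (!device_present)
		return -1;
	memset(buf, 0xab, size);	/* pretend the device returned data */
	return 0;
}

static void pio_in(unsigned short port, unsigned char *data, int size, int count)
{
	for (int i = 0; i < count; i++, data += size) {
		if (bus_read(port, data, size)) {
			/* Device vanished: read the remainder as zeroes. */
			memset(data, 0, (size_t)size * (count - i));
			break;
		}
	}
}

int main(void)
{
	unsigned char buf[4];

	device_present = 0;		/* simulate mid-run unregistration */
	pio_in(0x60, buf, 1, sizeof(buf));
	printf("%02x %02x %02x %02x\n", buf[0], buf[1], buf[2], buf[3]); /* 00 00 00 00 */
	return 0;
}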
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 524a96d26399..5a56d39bd81f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7593,8 +7593,19 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data); else r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data); - if (r) - goto userspace_io; + + if (r) { + if (i == 0) + goto userspace_io; + + /* + * Userspace must have unregistered the device while PIO + * was running. Drop writes / read as 0 (the buffer + * was zeroed in __emulator_pio_in). + */ + break; + } + data += size; } return 1; @@ -7606,7 +7617,6 @@ userspace_io: vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; vcpu->run->io.count = count; vcpu->run->io.port = port; - return 0; } -- cgit v1.2.3 From 30d583fd4e1ecafa8642f9f74e102876b4aeb733 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 22 Oct 2021 08:07:19 -0400 Subject: KVM: x86: move all vcpu->arch.pio* setup in emulator_pio_in_out() For now, this is basically an excuse to add back the void* argument to the function, while removing some knowledge of vcpu->arch.pio* from its callers. The WARN that vcpu->arch.pio.count is zero is also extended to OUT operations. The vcpu->arch.pio* fields still need to be filled even when the PIO is handled in-kernel as __emulator_pio_in() is always followed by complete_emulator_pio_in(). But after fixing that, it will be possible to to only populate the vcpu->arch.pio* fields on userspace exits. No functional change intended. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 2 +- arch/x86/kvm/x86.c | 21 +++++++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index bc85622e28b2..2120d7c060a9 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -154,7 +154,7 @@ TRACE_EVENT(kvm_xen_hypercall, TRACE_EVENT(kvm_pio, TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, - unsigned int count, void *data), + unsigned int count, const void *data), TP_ARGS(rw, port, size, count, data), TP_STRUCT__entry( diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5a56d39bd81f..368d0d4d56ff 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7576,17 +7576,25 @@ emul_write: } static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, - unsigned short port, + unsigned short port, void *data, unsigned int count, bool in) { - void *data = vcpu->arch.pio_data; unsigned i; int r; + WARN_ON_ONCE(vcpu->arch.pio.count); vcpu->arch.pio.port = port; vcpu->arch.pio.in = in; vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; + if (in) { + /* The buffer is only used in complete_emulator_pio_in(). 
*/ + WARN_ON(data); + memset(vcpu->arch.pio_data, 0, size * count); + } else { + memcpy(vcpu->arch.pio_data, data, size * count); + } + data = vcpu->arch.pio_data; for (i = 0; i < count; i++) { if (in) @@ -7623,9 +7631,7 @@ userspace_io: static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port, unsigned int count) { - WARN_ON(vcpu->arch.pio.count); - memset(vcpu->arch.pio_data, 0, size * count); - return emulator_pio_in_out(vcpu, size, port, count, true); + return emulator_pio_in_out(vcpu, size, port, NULL, count, true); } static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val) @@ -7674,9 +7680,8 @@ static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, { int ret; - memcpy(vcpu->arch.pio_data, val, size * count); - trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); - ret = emulator_pio_in_out(vcpu, size, port, count, false); + trace_kvm_pio(KVM_PIO_OUT, port, size, count, val); + ret = emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); if (ret) vcpu->arch.pio.count = 0; -- cgit v1.2.3 From 0c05e10bce5217657bcdaa9787c9ea94e6b384b2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 15 Jun 2022 10:24:01 -0400 Subject: KVM: x86: wean in-kernel PIO from vcpu->arch.pio* Make emulator_pio_in_out operate directly on the provided buffer as long as PIO is handled inside KVM. For input operations, this means that, in the case of in-kernel PIO, __emulator_pio_in() does not have to be always followed by complete_emulator_pio_in(). This affects emulator_pio_in() and kvm_sev_es_ins(); for the latter, that is why the call moves from advance_sev_es_emulated_ins() to complete_sev_es_emulated_ins(). For output, it means that vcpu->pio.count is never set unnecessarily and there is no need to clear it; but also vcpu->pio.size must not be used in kvm_sev_es_outs(), because it will not be updated for in-kernel OUT. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 73 ++++++++++++++++++++++++------------------------------ 1 file changed, 32 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 368d0d4d56ff..716ae5c92687 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7583,19 +7583,6 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, int r; WARN_ON_ONCE(vcpu->arch.pio.count); - vcpu->arch.pio.port = port; - vcpu->arch.pio.in = in; - vcpu->arch.pio.count = count; - vcpu->arch.pio.size = size; - if (in) { - /* The buffer is only used in complete_emulator_pio_in(). */ - WARN_ON(data); - memset(vcpu->arch.pio_data, 0, size * count); - } else { - memcpy(vcpu->arch.pio_data, data, size * count); - } - data = vcpu->arch.pio_data; - for (i = 0; i < count; i++) { if (in) r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data); @@ -7608,9 +7595,10 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, /* * Userspace must have unregistered the device while PIO - * was running. Drop writes / read as 0 (the buffer - * was zeroed in __emulator_pio_in). + * was running. Drop writes / read as 0. 
*/ + if (in) + memset(data, 0, size * (count - i)); break; } @@ -7619,6 +7607,16 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, return 1; userspace_io: + vcpu->arch.pio.port = port; + vcpu->arch.pio.in = in; + vcpu->arch.pio.count = count; + vcpu->arch.pio.size = size; + + if (in) + memset(vcpu->arch.pio_data, 0, size * count); + else + memcpy(vcpu->arch.pio_data, data, size * count); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = size; @@ -7629,15 +7627,19 @@ userspace_io: } static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, unsigned int count) + unsigned short port, void *val, unsigned int count) { - return emulator_pio_in_out(vcpu, size, port, NULL, count, true); + int r = emulator_pio_in_out(vcpu, size, port, val, count, true); + if (r) + trace_kvm_pio(KVM_PIO_IN, port, size, count, val); + + return r; } static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val) { int size = vcpu->arch.pio.size; - unsigned count = vcpu->arch.pio.count; + unsigned int count = vcpu->arch.pio.count; memcpy(val, vcpu->arch.pio_data, size * count); trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data); vcpu->arch.pio.count = 0; @@ -7654,16 +7656,11 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, * shenanigans as KVM doesn't support modifying the rep count, * and the emulator ensures @count doesn't overflow the buffer. */ - } else { - int r = __emulator_pio_in(vcpu, size, port, count); - if (!r) - return r; - - /* Results already available, fall through. */ + complete_emulator_pio_in(vcpu, val); + return 1; } - complete_emulator_pio_in(vcpu, val); - return 1; + return __emulator_pio_in(vcpu, size, port, val, count); } static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, @@ -7678,14 +7675,8 @@ static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port, const void *val, unsigned int count) { - int ret; - trace_kvm_pio(KVM_PIO_OUT, port, size, count, val); - ret = emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); - if (ret) - vcpu->arch.pio.count = 0; - - return ret; + return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, @@ -13245,7 +13236,7 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, /* memcpy done already by emulator_pio_out. 
*/ vcpu->arch.sev_pio_count -= count; - vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; + vcpu->arch.sev_pio_data += count * size; if (!ret) break; @@ -13261,20 +13252,20 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port); -static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu) +static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu, unsigned count, int size) { - unsigned count = vcpu->arch.pio.count; - complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data); vcpu->arch.sev_pio_count -= count; - vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; + vcpu->arch.sev_pio_data += count * size; } static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) { + unsigned count = vcpu->arch.pio.count; int size = vcpu->arch.pio.size; int port = vcpu->arch.pio.port; - advance_sev_es_emulated_ins(vcpu); + complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data); + advance_sev_es_emulated_ins(vcpu, count, size); if (vcpu->arch.sev_pio_count) return kvm_sev_es_ins(vcpu, size, port); return 1; @@ -13286,11 +13277,11 @@ static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, for (;;) { unsigned int count = min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); - if (!__emulator_pio_in(vcpu, size, port, count)) + if (!__emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count)) break; /* Emulation done by the kernel. */ - advance_sev_es_emulated_ins(vcpu); + advance_sev_es_emulated_ins(vcpu, count, size); if (!vcpu->arch.sev_pio_count) return 1; } -- cgit v1.2.3 From dc7a4bfde507ffe1d8bef49aba1322f1d20c2cb3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 22 Oct 2021 08:01:36 -0400 Subject: KVM: x86: wean fast IN from emulator_pio_in Use __emulator_pio_in() directly for fast PIO instead of bouncing through emulator_pio_in() now that __emulator_pio_in() fills "val" when handling in-kernel PIO. vcpu->arch.pio.count is guaranteed to be '0', so this a pure nop. emulator_pio_in_emulated is now the last caller of emulator_pio_in. No functional change intended. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 716ae5c92687..b824ffc63b17 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8886,7 +8886,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, /* For size less than 4 we merge, else we zero extend */ val = (size < 4) ? kvm_rax_read(vcpu) : 0; - ret = emulator_pio_in(vcpu, size, port, &val, 1); + ret = __emulator_pio_in(vcpu, size, port, &val, 1); if (ret) { kvm_rax_write(vcpu, val); return ret; -- cgit v1.2.3 From f35cee4adb54af59129193d528706b390befb5f4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 22 Oct 2021 08:19:48 -0400 Subject: KVM: x86: de-underscorify __emulator_pio_in Now all callers except emulator_pio_in_emulated are using __emulator_pio_in/complete_emulator_pio_in explicitly. Move the "either copy the result or attempt PIO" logic in emulator_pio_in_emulated, and rename __emulator_pio_in to just emulator_pio_in. 
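The resulting split of responsibilities, "complete a previous userspace
exit" versus "start fresh PIO", can be modeled in a few lines
(hypothetical names, not the kernel code):

#include <stdio.h>
#include <string.h>

struct pio {
	unsigned count;		/* nonzero: a userspace exit awaits completion */
	unsigned char buf[4];	/* data returned by userspace */
};

/* Stand-in for an in-kernel port read; always succeeds here. */
static int do_port_read(unsigned char *val, int size)
{
	memset(val, 0x5a, size);
	return 1;
}

static int pio_in_emulated(struct pio *p, unsigned char *val, int size)
{
	if (p->count) {
		memcpy(val, p->buf, size);	/* copy back the completed result */
		p->count = 0;
		return 1;
	}
	return do_port_read(val, size);		/* otherwise start a fresh read */
}

int main(void)
{
	struct pio p = { .count = 1, .buf = { 0x12 } };
	unsigned char v;

	pio_in_emulated(&p, &v, 1);	/* completion path */
	printf("completed: 0x%02x\n", v);
	pio_in_emulated(&p, &v, 1);	/* fresh-read path */
	printf("fresh:     0x%02x\n", v);
	return 0;
}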
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b824ffc63b17..1a017a5a680b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7626,8 +7626,8 @@ userspace_io: return 0; } -static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, unsigned int count) +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, unsigned int count) { int r = emulator_pio_in_out(vcpu, size, port, val, count, true); if (r) @@ -7645,9 +7645,11 @@ static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val) vcpu->arch.pio.count = 0; } -static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, unsigned int count) +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); if (vcpu->arch.pio.count) { /* * Complete a previous iteration that required userspace I/O. @@ -7660,15 +7662,7 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, return 1; } - return __emulator_pio_in(vcpu, size, port, val, count); -} - -static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, void *val, - unsigned int count) -{ - return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count); - + return emulator_pio_in(vcpu, size, port, val, count); } static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, @@ -8886,7 +8880,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, /* For size less than 4 we merge, else we zero extend */ val = (size < 4) ? kvm_rax_read(vcpu) : 0; - ret = __emulator_pio_in(vcpu, size, port, &val, 1); + ret = emulator_pio_in(vcpu, size, port, &val, 1); if (ret) { kvm_rax_write(vcpu, val); return ret; @@ -13277,7 +13271,7 @@ static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, for (;;) { unsigned int count = min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); - if (!__emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count)) + if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count)) break; /* Emulation done by the kernel. */ -- cgit v1.2.3 From db209369d48eab1e2c724d90c606752d398ae334 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 22 Oct 2021 08:47:56 -0400 Subject: KVM: SEV-ES: reuse advance_sev_es_emulated_ins for OUT too complete_emulator_pio_in() only has to be called by complete_sev_es_emulated_ins() now; therefore, all that the function does now is adjust sev_pio_count and sev_pio_data. Which is the same for both IN and OUT. No functional change intended. 
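The helper can be shared because IN and OUT step through the guest
buffer identically. A minimal model of the cursor arithmetic
(hypothetical names, illustrative only):

#include <assert.h>
#include <stddef.h>

struct sev_pio {
	unsigned count;		/* items still to transfer */
	unsigned char *data;	/* current position in the buffer */
};

static void advance_pio(struct sev_pio *p, unsigned done, int size)
{
	p->count -= done;
	p->data  += (size_t)done * size;	/* same step for IN and OUT */
}

int main(void)
{
	unsigned char buf[64];
	struct sev_pio p = { .count = 16, .data = buf };

	advance_pio(&p, 4, 4);			/* 4 items of 4 bytes each */
	assert(p.count == 12 && p.data == buf + 16);
	return 0;
}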
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1a017a5a680b..567d13405445 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13206,6 +13206,12 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
 }
 EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);

+static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size)
+{
+	vcpu->arch.sev_pio_count -= count;
+	vcpu->arch.sev_pio_data += count * size;
+}
+
 static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
 			   unsigned int port);

@@ -13229,8 +13235,7 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
 		int ret = emulator_pio_out(vcpu, size, port,
 					   vcpu->arch.sev_pio_data, count);

 		/* memcpy done already by emulator_pio_out. */
-		vcpu->arch.sev_pio_count -= count;
-		vcpu->arch.sev_pio_data += count * size;
+		advance_sev_es_emulated_pio(vcpu, count, size);
 		if (!ret)
 			break;

@@ -13246,12 +13251,6 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
 static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
 			  unsigned int port);

-static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu, unsigned count, int size)
-{
-	vcpu->arch.sev_pio_count -= count;
-	vcpu->arch.sev_pio_data += count * size;
-}
-
 static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
 {
 	unsigned count = vcpu->arch.pio.count;
@@ -13259,7 +13258,7 @@ static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
 	int port = vcpu->arch.pio.port;

 	complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
-	advance_sev_es_emulated_ins(vcpu, count, size);
+	advance_sev_es_emulated_pio(vcpu, count, size);
 	if (vcpu->arch.sev_pio_count)
 		return kvm_sev_es_ins(vcpu, size, port);
 	return 1;
@@ -13275,7 +13274,7 @@ static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
 			break;

 		/* Emulation done by the kernel. */
-		advance_sev_es_emulated_ins(vcpu, count, size);
+		advance_sev_es_emulated_pio(vcpu, count, size);
 		if (!vcpu->arch.sev_pio_count)
 			return 1;
 	}
--
cgit v1.2.3

From 72ae5822b81a6686c4b4d526ccdd7b7f5f0f9b97 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Fri, 24 Jun 2022 17:18:07 +0000
Subject: KVM: x86/mmu: Use "unsigned int", not "u32", for SPTEs' @access info

Use an "unsigned int" for @access parameters instead of a "u32", mostly
to be consistent throughout KVM, but also because "u32" is misleading.
@access can actually squeeze into a u8, i.e. doesn't need 32 bits, but
is passed as an "unsigned int" because sp->role.access is an unsigned
int.

No functional change intended.
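That the access bits fit well below 32 bits can be seen in the hunk
below, where the shadowed translation packs @access into the low bits
of a page-shifted gfn. A standalone sketch of that packing, assuming
the three ACC_* permission bits occupy the low three bit positions:

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define ACC_ALL    0x7	/* exec | write | user (assumed encoding) */

int main(void)
{
	uint64_t gfn = 0xabcde, access = ACC_ALL;
	uint64_t packed = (gfn << PAGE_SHIFT) | access;	/* as in set_translation */

	assert(packed >> PAGE_SHIFT == gfn);
	assert((packed & ((1u << PAGE_SHIFT) - 1)) == access);
	return 0;
}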
Link: https://lore.kernel.org/all/YqyZxEfxXLsHGoZ%2F@google.com
Cc: David Matlack
Signed-off-by: Sean Christopherson
Message-Id: <20220624171808.2845941-3-seanjc@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/mmu/mmu.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index bd74a287b54a..d5554cd61492 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -717,7 +717,8 @@ static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
 	return sp->role.access;
 }

-static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index, gfn_t gfn, u32 access)
+static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
+					 gfn_t gfn, unsigned int access)
 {
 	if (sp_has_gptes(sp)) {
 		sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
@@ -735,7 +736,8 @@ static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index, gfn
 		  sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn);
 }

-static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index, u32 access)
+static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index,
+				    unsigned int access)
 {
 	gfn_t gfn = kvm_mmu_page_get_gfn(sp, index);

@@ -1580,7 +1582,7 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,

 static void __rmap_add(struct kvm *kvm,
 		       struct kvm_mmu_memory_cache *cache,
 		       const struct kvm_memory_slot *slot,
-		       u64 *spte, gfn_t gfn, u32 access)
+		       u64 *spte, gfn_t gfn, unsigned int access)
 {
 	struct kvm_mmu_page *sp;
 	struct kvm_rmap_head *rmap_head;
@@ -1601,7 +1603,7 @@ static void __rmap_add(struct kvm *kvm,
 }

 static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
-		     u64 *spte, gfn_t gfn, u32 access)
+		     u64 *spte, gfn_t gfn, unsigned int access)
 {
 	struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache;

--
cgit v1.2.3

From b9b71f43683ae9d76b0989249607bbe8c9eb6c5c Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Fri, 24 Jun 2022 17:18:08 +0000
Subject: KVM: x86/mmu: Buffer nested MMU split_desc_cache only by default capacity

Buffer split_desc_cache, the cache used to allocate rmap list entries,
only by the default cache capacity (currently 40), not by doubling the
minimum (513). Aliasing L2 GPAs to L1 GPAs is uncommon, thus eager page
splitting is unlikely to need 500+ entries. And because each object is
a non-trivial 128 bytes (see struct pte_list_desc), those extra ~500
entries mean KVM is in all likelihood wasting ~64KiB of memory per VM.

Link: https://lore.kernel.org/all/YrTDcrsn0%2F+alpzf@google.com
Cc: David Matlack
Signed-off-by: Sean Christopherson
Message-Id: <20220624171808.2845941-4-seanjc@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/mmu/mmu.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index d5554cd61492..f7fa4c31b7c5 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6118,17 +6118,25 @@ static bool need_topup_split_caches_or_resched(struct kvm *kvm)

 static int topup_split_caches(struct kvm *kvm)
 {
+	/*
+	 * Allocating rmap list entries when splitting huge pages for nested
+	 * MMUs is uncommon as KVM needs to allocate if and only if there is
+	 * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be
+	 * aliased by multiple L2 gfns. Aliasing gfns when using TDP is very
+	 * atypical for VMMs; a few gfns are often aliased during boot, e.g.
+	 * when remapping firmware, but aliasing rarely occurs post-boot. If
+	 * there is only one rmap entry, rmap->val points directly at that one
+	 * entry and doesn't need to allocate a list. Buffer the cache by the
+	 * default capacity so that KVM doesn't have to topup the cache if it
+	 * encounters an aliased gfn or two.
+	 */
+	const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS +
+			     KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;
 	int r;

 	lockdep_assert_held(&kvm->slots_lock);

-	/*
-	 * Setting capacity == min would cause KVM to drop mmu_lock even if
-	 * just one object was consumed from the cache, so make capacity
-	 * larger than min.
-	 */
-	r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache,
-					 2 * SPLIT_DESC_CACHE_MIN_NR_OBJECTS,
+	r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity,
 					 SPLIT_DESC_CACHE_MIN_NR_OBJECTS);
 	if (r)
 		return r;
--
cgit v1.2.3

From 2368048bf5c2ec4b604ac3431564071e89a0bc71 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 12 May 2022 22:27:14 +0000
Subject: KVM: x86: Signal #GP, not -EPERM, on bad WRMSR(MCi_CTL/STATUS)

Return '1', not '-1', when handling an illegal WRMSR to a MCi_CTL or
MCi_STATUS MSR. The behavior of "all zeros" or "all ones" for CTL MSRs
is architectural, as is the "only zeros" behavior for STATUS MSRs. I.e.
the intent is to inject a #GP, not exit to userspace due to an
unhandled emulation case. Returning '-1' gets interpreted as -EPERM up
the stack and effectively kills the guest.

Fixes: 890ca9aefa78 ("KVM: Add MCE support")
Fixes: 9ffd986c6e4e ("KVM: X86: #GP when guest attempts to write MCi_STATUS register w/o 0")
Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson
Reviewed-by: Jim Mattson
Link: https://lore.kernel.org/r/20220512222716.4112548-2-seanjc@google.com
---
 arch/x86/kvm/x86.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7ce0c6fe166d..891ca973c838 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3239,13 +3239,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			 */
 			if ((offset & 0x3) == 0 &&
 			    data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
-				return -1;
+				return 1;

 			/* MCi_STATUS */
 			if (!msr_info->host_initiated &&
 			    (offset & 0x3) == 1 && data != 0) {
 				if (!can_set_mci_status(vcpu))
-					return -1;
+					return 1;
 			}

 			vcpu->arch.mce_banks[offset] = data;
--
cgit v1.2.3

From f5223a332f3647a0e3725e9b4a102e9659c84ce4 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 12 May 2022 22:27:15 +0000
Subject: KVM: x86: Use explicit case-statements for MCx banks in {g,s}et_msr_mce()

Use an explicit case statement to grab the full range of MCx bank MSRs
in {g,s}et_msr_mce(), and manually check only the "end" (the number of
banks configured by userspace may be less than the max). The "default"
trick works, but is a bit odd now, and will be quite odd if/when
support for accessing MCx_CTL2 MSRs is added, which has near-identical
logic.

Hoist "offset" to function scope so as to avoid curly braces for the
case statement, and because MCx_CTL2 support will need the same
variables.

Opportunistically clean up the comment about allowing bit 10 to be
cleared from bank 4.

No functional change intended.
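The refactor leans on GCC's case-range extension, which the kernel uses
freely: the case label spans the architectural maximum, and the body
rejects MSRs beyond what userspace actually configured. A compile-able
sketch of the pattern, with placeholder values standing in for the real
MSR indices (gcc/clang only):

#define MC0_CTL		0x400
#define MCx_CTL(n)	(MC0_CTL + 4 * (n))
#define MAX_BANKS	32

static int handle_bank_msr(unsigned bank_num, unsigned msr)
{
	switch (msr) {
	case MC0_CTL ... MCx_CTL(MAX_BANKS) - 1:	/* GNU C range case */
		if (msr > MCx_CTL(bank_num) - 1)
			return 1;	/* bank not configured: inject #GP */
		/* ... handle the bank MSR ... */
		return 0;
	default:
		return 1;
	}
}

int main(void)
{
	/* 8 banks configured: bank 9's CTL MSR must be rejected. */
	return handle_bank_msr(8, MCx_CTL(9)) == 1 ? 0 : 1;
}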
Cc: Jue Wang Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20220512222716.4112548-3-seanjc@google.com --- arch/x86/kvm/x86.c | 75 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 39 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 891ca973c838..c18d57027838 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3209,6 +3209,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) unsigned bank_num = mcg_cap & 0xff; u32 msr = msr_info->index; u64 data = msr_info->data; + u32 offset, last_msr; switch (msr) { case MSR_IA32_MCG_STATUS: @@ -3222,35 +3223,36 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.mcg_ctl = data; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); - - /* only 0 or all 1s can be written to IA32_MCi_CTL - * some Linux kernels though clear bit 10 in bank 4 to - * workaround a BIOS/GART TBL issue on AMD K8s, ignore - * this to avoid an uncatched #GP in the guest. - * - * UNIXWARE clears bit 0 of MC1_CTL to ignore - * correctable, single-bit ECC data errors. - */ - if ((offset & 0x3) == 0 && - data != 0 && (data | (1 << 10) | 1) != ~(u64)0) - return 1; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; - /* MCi_STATUS */ - if (!msr_info->host_initiated && - (offset & 0x3) == 1 && data != 0) { - if (!can_set_mci_status(vcpu)) - return 1; - } + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); - vcpu->arch.mce_banks[offset] = data; - break; - } + /* + * Only 0 or all 1s can be written to IA32_MCi_CTL, all other + * values are architecturally undefined. But, some Linux + * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB + * issue on AMD K8s, allow bit 10 to be clear when setting all + * other bits in order to avoid an uncaught #GP in the guest. + * + * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable, + * single-bit ECC data errors. + */ + if ((offset & 0x3) == 0 && + data != 0 && (data | (1 << 10) | 1) != ~(u64)0) + return 1; + + /* MCi_STATUS */ + if (!msr_info->host_initiated && (offset & 0x3) == 1 && + data != 0 && !can_set_mci_status(vcpu)) + return 1; + + vcpu->arch.mce_banks[offset] = data; + break; + default: return 1; } return 0; @@ -3819,6 +3821,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) u64 data; u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; + u32 offset, last_msr; switch (msr) { case MSR_IA32_P5_MC_ADDR: @@ -3836,16 +3839,16 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) case MSR_IA32_MCG_STATUS: data = vcpu->arch.mcg_status; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + case MSR_IA32_MC0_CTL ... 
MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+		last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
+		if (msr > last_msr)
+			return 1;

-			data = vcpu->arch.mce_banks[offset];
-			break;
-		}
+		offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
+					    last_msr + 1 - MSR_IA32_MC0_CTL);
+		data = vcpu->arch.mce_banks[offset];
+		break;
+	default:
 		return 1;
 	}
 	*pdata = data;
--
cgit v1.2.3

From 54ad60ba9d2673293c72a7c1c4f092622a1f8789 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 12 May 2022 22:27:16 +0000
Subject: KVM: x86: Add helpers to identify CTL and STATUS MCi MSRs

Add helpers to identify CTL (control) and STATUS MCi MSR types instead
of open coding the checks using the offset. Using the offset is
perfectly safe, but unintuitive, as understanding what the code does
requires knowing that the offset calculation will not affect the lower
two bits.

Opportunistically comment the STATUS logic to save readers a trip to
Intel's SDM or AMD's APM to understand the "data != 0" check.

No functional change intended.

Signed-off-by: Sean Christopherson
Reviewed-by: Jim Mattson
Link: https://lore.kernel.org/r/20220512222716.4112548-4-seanjc@google.com
---
 arch/x86/kvm/x86.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c18d57027838..3b7c9c1b4540 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3191,6 +3191,16 @@ static void kvmclock_sync_fn(struct work_struct *work)
 					KVMCLOCK_SYNC_PERIOD);
 }

+/* These helpers are safe iff @msr is known to be an MCx bank MSR. */
+static bool is_mci_control_msr(u32 msr)
+{
+	return (msr & 3) == 0;
+}
+
+static bool is_mci_status_msr(u32 msr)
+{
+	return (msr & 3) == 1;
+}
+
 /*
  * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
  */
@@ -3228,9 +3238,6 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (msr > last_msr)
 			return 1;

-		offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
-					    last_msr + 1 - MSR_IA32_MC0_CTL);
-
 		/*
 		 * Only 0 or all 1s can be written to IA32_MCi_CTL, all other
 		 * values are architecturally undefined. But, some Linux
@@ -3241,15 +3248,21 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		 * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable,
 		 * single-bit ECC data errors.
 		 */
-		if ((offset & 0x3) == 0 &&
+		if (is_mci_control_msr(msr) &&
 		    data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
 			return 1;

-		/* MCi_STATUS */
-		if (!msr_info->host_initiated && (offset & 0x3) == 1 &&
+		/*
+		 * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR.
+		 * AMD-based CPUs allow non-zero values, but if and only if
+		 * HWCR[McStatusWrEn] is set.
+		 */
+		if (!msr_info->host_initiated && is_mci_status_msr(msr) &&
 		    data != 0 && !can_set_mci_status(vcpu))
 			return 1;

+		offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
+					    last_msr + 1 - MSR_IA32_MC0_CTL);
 		vcpu->arch.mce_banks[offset] = data;
 		break;
 	default:
--
cgit v1.2.3

From 03d84f96890662681feee129cf92491f49247d28 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Fri, 8 Jul 2022 15:38:51 -0700
Subject: KVM: x86: Initialize number of APIC LVT entries during APIC creation

Initialize the number of LVT entries during APIC creation, else the
field will be incorrectly left '0' if userspace never invokes
KVM_X86_SETUP_MCE.

Add and use a helper to calculate the number of entries even though
MCG_CMCI_P is not set by default in vcpu->arch.mcg_cap.
Relying on that to always be true is unnecessarily risky, and subtle/confusing as well. Fixes: 4b903561ec49 ("KVM: x86: Add Corrected Machine Check Interrupt (CMCI) emulation to lapic.") Reported-by: Xiaoyao Li Cc: Jue Wang Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6ff17d5a2ae3..1540d01ecb67 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -405,6 +405,11 @@ static inline bool kvm_lapic_lvt_supported(struct kvm_lapic *apic, int lvt_index return apic->nr_lvt_entries > lvt_index; } +static inline int kvm_apic_calc_nr_lvt_entries(struct kvm_vcpu *vcpu) +{ + return KVM_APIC_MAX_NR_LVT_ENTRIES - !(vcpu->arch.mcg_cap & MCG_CMCI_P); +} + void kvm_apic_set_version(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -2561,6 +2566,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) } apic->vcpu = vcpu; + apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); + hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); apic->lapic_timer.timer.function = apic_timer_fn; -- cgit v1.2.3 From f83894b24c2a96fcecdff4858755aa79eb756465 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Jul 2022 15:48:10 -0700 Subject: KVM: x86: Fix handling of APIC LVT updates when userspace changes MCG_CAP Add a helper to update KVM's in-kernel local APIC in response to MCG_CAP being changed by userspace to fix multiple bugs. First and foremost, KVM needs to check that there's an in-kernel APIC prior to dereferencing vcpu->arch.apic. Beyond that, any "new" LVT entries need to be masked, and the APIC version register needs to be updated as it reports out the number of LVT entries. Fixes: 4b903561ec49 ("KVM: x86: Add Corrected Machine Check Interrupt (CMCI) emulation to lapic.") Reported-by: syzbot+8cdad6430c24f396f158@syzkaller.appspotmail.com Cc: Siddh Raman Pant Cc: Jue Wang Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 19 +++++++++++++++++++ arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1540d01ecb67..50354c7a2dc1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -433,6 +433,25 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) kvm_lapic_set_reg(apic, APIC_LVR, v); } +void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu) +{ + int nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); + struct kvm_lapic *apic = vcpu->arch.apic; + int i; + + if (!lapic_in_kernel(vcpu) || nr_lvt_entries == apic->nr_lvt_entries) + return; + + /* Initialize/mask any "new" LVT entries. */ + for (i = apic->nr_lvt_entries; i < nr_lvt_entries; i++) + kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED); + + apic->nr_lvt_entries = nr_lvt_entries; + + /* The number of LVT entries is reflected in the version register. 
*/ + kvm_apic_set_version(vcpu); +} + static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = { [LVT_TIMER] = LVT_MASK, /* timer mode mask added at runtime */ [LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK, diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 762bf6163798..117a46df5cc1 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -99,6 +99,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_recalculate_apic_map(struct kvm *kvm); void kvm_apic_set_version(struct kvm_vcpu *vcpu); +void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fb37d11dec2d..801c3cfd3db5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4893,8 +4893,8 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, if (mcg_cap & MCG_CMCI_P) vcpu->arch.mci_ctl2_banks[bank] = 0; } - vcpu->arch.apic->nr_lvt_entries = - KVM_APIC_MAX_NR_LVT_ENTRIES - !(mcg_cap & MCG_CMCI_P); + + kvm_apic_after_set_mcg_cap(vcpu); static_call(kvm_x86_setup_mce)(vcpu); out: -- cgit v1.2.3 From 159e037d2e36d93a7b066228c6543537c25235c8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 8 Jul 2022 14:51:47 +0200 Subject: KVM: x86: Fully initialize 'struct kvm_lapic_irq' in kvm_pv_kick_cpu_op() 'vector' and 'trig_mode' fields of 'struct kvm_lapic_irq' are left uninitialized in kvm_pv_kick_cpu_op(). While these fields are normally not needed for APIC_DM_REMRD, they're still referenced by __apic_accept_irq() for trace_kvm_apic_accept_irq(). Fully initialize the structure to avoid consuming random stack memory. Fixes: a183b638b61c ("KVM: x86: make apic_accept_irq tracepoint more generic") Reported-by: syzbot+d6caa905917d353f0d07@syzkaller.appspotmail.com Signed-off-by: Vitaly Kuznetsov Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20220708125147.593975-1-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 801c3cfd3db5..759bcc0a3300 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9340,15 +9340,17 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, */ static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid) { - struct kvm_lapic_irq lapic_irq; - - lapic_irq.shorthand = APIC_DEST_NOSHORT; - lapic_irq.dest_mode = APIC_DEST_PHYSICAL; - lapic_irq.level = 0; - lapic_irq.dest_id = apicid; - lapic_irq.msi_redir_hint = false; + /* + * All other fields are unused for APIC_DM_REMRD, but may be consumed by + * common code, e.g. for tracing. Defer initialization to the compiler. 
+	 */
+	struct kvm_lapic_irq lapic_irq = {
+		.delivery_mode = APIC_DM_REMRD,
+		.dest_mode = APIC_DEST_PHYSICAL,
+		.shorthand = APIC_DEST_NOSHORT,
+		.dest_id = apicid,
+	};

-	lapic_irq.delivery_mode = APIC_DM_REMRD;
 	kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
 }
--
cgit v1.2.3

From 156b9d76e8822f2956c15029acf2d4b171502f3a Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov
Date: Tue, 12 Jul 2022 15:50:09 +0200
Subject: KVM: nVMX: Always enable TSC scaling for L2 when it was enabled for L1

Windows 10/11 guests with Hyper-V role (WSL2) enabled are observed to
hang upon boot or shortly after when a non-default TSC frequency is set
for L1. The issue is observed on a host where TSC scaling is supported.
The problem appears to be that Windows doesn't use TSC scaling for its
guests, even when the feature is advertised, and KVM filters
SECONDARY_EXEC_TSC_SCALING out when creating L2 controls from L1's
VMCS. This leads to L2 running with the default frequency (matching the
host's) while L1 is running with an altered one.

Keep SECONDARY_EXEC_TSC_SCALING in the secondary exec controls for L2
when it was set for L1. TSC_MULTIPLIER is already correctly computed
and written by prepare_vmcs02().

Signed-off-by: Vitaly Kuznetsov
Fixes: d041b5ea93352b ("KVM: nVMX: Enable nested TSC scaling")
Cc: stable@vger.kernel.org
Reviewed-by: Maxim Levitsky
Link: https://lore.kernel.org/r/20220712135009.952805-1-vkuznets@redhat.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/nested.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 778f82015f03..bfa366938c49 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2284,7 +2284,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
 				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
 				  SECONDARY_EXEC_ENABLE_VMFUNC |
-				  SECONDARY_EXEC_TSC_SCALING |
 				  SECONDARY_EXEC_DESC);

 		if (nested_cpu_has(vmcs12,
--
cgit v1.2.3

From 6e1d2a3f25d518cc3d9703115c6fbec704a6c5bb Mon Sep 17 00:00:00 2001
From: Hou Wenlong
Date: Fri, 1 Jul 2022 17:24:13 +0800
Subject: KVM: x86/mmu: Replace UNMAPPED_GVA with INVALID_GPA for gva_to_gpa()

The result of gva_to_gpa() is a physical address, not a virtual
address, so it is odd that the UNMAPPED_GVA macro is used as its
failure value. Replace UNMAPPED_GVA with INVALID_GPA and drop the
UNMAPPED_GVA macro.

No functional change intended.
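The sentinel works because x86 physical addresses are at most 52 bits,
so an all-ones 64-bit value can never be a legal translation result. A
minimal sketch of the convention used throughout the diff below:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t gpa_t;
#define INVALID_GPA (~(gpa_t)0)	/* all ones: never a valid physical address */

/* Toy walker: pretend addresses below 1 GiB map 1:1 and the rest fault. */
static gpa_t gva_to_gpa(uint64_t gva)
{
	return gva < (1ull << 30) ? gva : INVALID_GPA;
}

int main(void)
{
	gpa_t gpa = gva_to_gpa(0xffff000000000000ull);

	if (gpa == INVALID_GPA)
		puts("walk failed, inject fault");
	return 0;
}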
Signed-off-by: Hou Wenlong Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/6104978956449467d3c68f1ad7f2c2f6d771d0ee.1656667239.git.houwenlong.hwl@antgroup.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/mmu/paging_tmpl.h | 6 +++--- arch/x86/kvm/vmx/sgx.c | 2 +- arch/x86/kvm/x86.c | 18 +++++++++--------- 4 files changed, 13 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index de5a149d0971..dd6a26f7d46c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -129,7 +129,6 @@ #define INVALID_PAGE (~(hpa_t)0) #define VALID_PAGE(x) ((x) != INVALID_PAGE) -#define UNMAPPED_GVA (~(gpa_t)0) #define INVALID_GPA (~(gpa_t)0) /* KVM Hugepage definitions for x86 */ diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 2448fa8d8438..94bfd5f723ba 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -378,7 +378,7 @@ retry_walk: * information to fix the exit_qualification or exit_info_1 * fields. */ - if (unlikely(real_gpa == UNMAPPED_GVA)) + if (unlikely(real_gpa == INVALID_GPA)) return 0; host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa), @@ -431,7 +431,7 @@ retry_walk: #endif real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault); - if (real_gpa == UNMAPPED_GVA) + if (real_gpa == INVALID_GPA) return 0; walker->gfn = real_gpa >> PAGE_SHIFT; @@ -962,7 +962,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct x86_exception *exception) { struct guest_walker walker; - gpa_t gpa = UNMAPPED_GVA; + gpa_t gpa = INVALID_GPA; int r; #ifndef CONFIG_X86_64 diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 35e7ec91ae86..7ae8aa73724c 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -79,7 +79,7 @@ static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write, else *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex); - if (*gpa == UNMAPPED_GVA) { + if (*gpa == INVALID_GPA) { kvm_inject_emulated_page_fault(vcpu, &ex); return -EFAULT; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 759bcc0a3300..67dcaa670874 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -849,7 +849,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) */ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn), PFERR_USER_MASK | PFERR_WRITE_MASK, NULL); - if (real_gpa == UNMAPPED_GVA) + if (real_gpa == INVALID_GPA) return 0; /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */ @@ -7072,7 +7072,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return X86EMUL_PROPAGATE_FAULT; ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data, offset, toread); @@ -7103,7 +7103,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, /* Inline kvm_read_guest_virt_helper for speed. 
*/ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK, exception); - if (unlikely(gpa == UNMAPPED_GVA)) + if (unlikely(gpa == INVALID_GPA)) return X86EMUL_PROPAGATE_FAULT; offset = addr & (PAGE_SIZE-1); @@ -7173,7 +7173,7 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return X86EMUL_PROPAGATE_FAULT; ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite); if (ret < 0) { @@ -7284,7 +7284,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); - if (*gpa == UNMAPPED_GVA) + if (*gpa == INVALID_GPA) return -1; return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write); @@ -7521,7 +7521,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); - if (gpa == UNMAPPED_GVA || + if (gpa == INVALID_GPA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto emul_write; @@ -8338,7 +8338,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * If the mapping is invalid in guest, let cpu retry * it to generate fault. */ - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return true; } @@ -11378,7 +11378,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; - tr->valid = gpa != UNMAPPED_GVA; + tr->valid = gpa != INVALID_GPA; tr->writeable = 1; tr->usermode = 0; @@ -12983,7 +12983,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK); if (!(error_code & PFERR_PRESENT_MASK) || - mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != UNMAPPED_GVA) { + mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) { /* * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page * tables probably do not match the TLB. Just proceed -- cgit v1.2.3 From 79f772b9e8004c542cc88aaf680189c3f6e9a0f2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 22:56:15 +0000 Subject: KVM: x86: Query vcpu->vcpu_idx directly and drop its accessor, again Read vcpu->vcpu_idx directly instead of bouncing through the one-line wrapper, kvm_vcpu_get_idx(), and drop the wrapper. The wrapper is a remnant of the original implementation and serves no purpose; remove it (again) before it gains more users. kvm_vcpu_get_idx() was removed in the not-too-distant past by commit 4eeef2424153 ("KVM: x86: Query vcpu->vcpu_idx directly and drop its accessor"), but was unintentionally re-introduced by commit a54d806688fe ("KVM: Keep memslots in tree-based structures instead of array-based ones"), likely due to a rebase goof. The wrapper then managed to gain users in KVM's Xen code. No functional change intended. 
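For context, vcpu->vcpu_idx and vcpu->vcpu_id are distinct: vcpu_id is the (possibly sparse) identifier chosen by userspace at KVM_CREATE_VCPU time, whereas vcpu_idx is KVM's dense creation-order index, which is what makes it safe to use directly in fixed-size bitmaps such as the Xen poll_mask touched below. A rough sketch of the distinction (illustrative; only the field names are taken from the kernel):

	struct kvm_vcpu {
		int vcpu_idx;	/* dense: 0, 1, 2, ... in creation order */
		int vcpu_id;	/* userspace-chosen, may be sparse and large */
		/* ... */
	};

	/* vcpu_idx is bounded by the number of created vCPUs, so it can
	 * index a per-VM bitmap directly; vcpu_id cannot be used that way. */
	set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);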
Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20220614225615.3843835-1-seanjc@google.com --- arch/x86/kvm/xen.c | 10 +++++----- include/linux/kvm_host.h | 5 ----- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 610beba35907..a0c05ccbf4b1 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1049,7 +1049,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, else vcpu->arch.xen.poll_evtchn = -1; - set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask); + set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask); if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; @@ -1071,7 +1071,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, *r = 0; out: /* Really, this is only needed in case of timeout */ - clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask); + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask); if (unlikely(sched_poll.nr_ports > 1)) kfree(ports); @@ -1311,7 +1311,7 @@ static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port) int poll_evtchn = vcpu->arch.xen.poll_evtchn; if ((poll_evtchn == port || poll_evtchn == -1) && - test_and_clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask)) { + test_and_clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask)) { kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); } @@ -1344,7 +1344,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id); if (!vcpu) return -EINVAL; - WRITE_ONCE(xe->vcpu_idx, kvm_vcpu_get_idx(vcpu)); + WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx); } if (!vcpu->arch.xen.vcpu_info_cache.active) @@ -1540,7 +1540,7 @@ int kvm_xen_setup_evtchn(struct kvm *kvm, */ vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu); if (vcpu) - e->xen_evtchn.vcpu_idx = kvm_vcpu_get_idx(vcpu); + e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx; else e->xen_evtchn.vcpu_idx = -1; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3b40f8d68fbb..cb8168cc9755 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -907,11 +907,6 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) return NULL; } -static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) -{ - return vcpu->vcpu_idx; -} - void kvm_destroy_vcpus(struct kvm *kvm); void vcpu_load(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 43bb9e000ea4c62154c01844771fea25b8b83520 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 11 Jul 2022 22:57:53 +0000 Subject: KVM: x86: Tweak name of MONITOR/MWAIT #UD quirk to make it #UD specific Add a "UD" clause to KVM_X86_QUIRK_MWAIT_NEVER_FAULTS to make it clear that the quirk only controls the #UD behavior of MONITOR/MWAIT. KVM doesn't currently enforce fault checks when MONITOR/MWAIT are supported, but that could change in the future. SVM also has a virtualization hole in that it checks all faults before intercepts, and so "never faults" is already a lie when running on SVM. Fixes: bfbcc81bb82c ("KVM: x86: Add a quirk for KVM's "MONITOR/MWAIT are NOPs!" 
behavior") Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220711225753.1073989-4-seanjc@google.com --- Documentation/virt/kvm/api.rst | 2 +- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/include/uapi/asm/kvm.h | 2 +- arch/x86/kvm/x86.c | 2 +- tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index bafaeedd455c..cd9361f22530 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7523,7 +7523,7 @@ The valid bits in cap.args[0] are: incorrect hypercall instruction will generate a #UD within the guest. -KVM_X86_QUIRK_MWAIT_NEVER_FAULTS By default, KVM emulates MONITOR/MWAIT (if +KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS By default, KVM emulates MONITOR/MWAIT (if they are intercepted) as NOPs regardless of whether or not MONITOR/MWAIT are supported according to guest CPUID. When this quirk diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dd6a26f7d46c..d4ece7bf2124 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2096,6 +2096,6 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_OUT_7E_INC_RIP | \ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ - KVM_X86_QUIRK_MWAIT_NEVER_FAULTS) + KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index ee3896416c68..a0c0ab0c898e 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -439,7 +439,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) -#define KVM_X86_QUIRK_MWAIT_NEVER_FAULTS (1 << 6) +#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 67dcaa670874..db46c060acb5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2046,7 +2046,7 @@ EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) { - if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_FAULTS) && + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) && !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) return kvm_handle_invalid_op(vcpu); diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c index 6a4ebcdfa374..094c68d744c0 100644 --- a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c +++ b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c @@ -113,7 +113,7 @@ int main(int argc, char *argv[]) disabled_quirks = 0; if (testcase & MWAIT_QUIRK_DISABLED) - disabled_quirks |= KVM_X86_QUIRK_MWAIT_NEVER_FAULTS; + disabled_quirks |= KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS; if (testcase & MISC_ENABLES_QUIRK_DISABLED) disabled_quirks |= KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT; vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, disabled_quirks); -- cgit v1.2.3 From ec6e4d863258d4bfb36d48d5e3ef68140234d688 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 11 Jul 2022 23:27:48 +0000 Subject: KVM: x86: Mark TSS busy during LTR emulation _after_ all fault checks Wait to mark the TSS as busy during LTR 
emulation until after all fault checks for the LTR have passed. Specifically, don't mark the TSS busy if the new TSS base is non-canonical.

Opportunistically drop the one-off !seg_desc.PRESENT check for TR, as the only reason for the early check was to avoid marking a !PRESENT TSS as busy, i.e. the common !PRESENT check is now done before setting the busy bit.

Fixes: e37a75a13cda ("KVM: x86: Emulator ignores LDTR/TR extended base on LLDT/LTR")
Reported-by: syzbot+760a73552f47a8cd0fd9@syzkaller.appspotmail.com
Cc: stable@vger.kernel.org
Cc: Tetsuo Handa
Cc: Hou Wenlong
Signed-off-by: Sean Christopherson
Reviewed-by: Maxim Levitsky
Link: https://lore.kernel.org/r/20220711232750.1092012-2-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/emulate.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 39ea9138224c..09e4b67b881f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1699,16 +1699,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	case VCPU_SREG_TR:
 		if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
 			goto exception;
-		if (!seg_desc.p) {
-			err_vec = NP_VECTOR;
-			goto exception;
-		}
-		old_desc = seg_desc;
-		seg_desc.type |= 2; /* busy */
-		ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
-						  sizeof(seg_desc), &ctxt->exception);
-		if (ret != X86EMUL_CONTINUE)
-			return ret;
 		break;
 	case VCPU_SREG_LDTR:
 		if (seg_desc.s || seg_desc.type != 2)
@@ -1749,6 +1739,15 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				((u64)base3 << 32), ctxt))
 			return emulate_gp(ctxt, 0);
 	}
+
+	if (seg == VCPU_SREG_TR) {
+		old_desc = seg_desc;
+		seg_desc.type |= 2; /* busy */
+		ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
+						  sizeof(seg_desc), &ctxt->exception);
+		if (ret != X86EMUL_CONTINUE)
+			return ret;
+	}
 load:
 	ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg);
 	if (desc)
-- 
cgit v1.2.3

From 2626206963ace9e8bf92b6eea5ff78dd674c555c Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Mon, 11 Jul 2022 23:27:49 +0000
Subject: KVM: x86: Set error code to segment selector on LLDT/LTR non-canonical #GP

When injecting a #GP on LLDT/LTR due to a non-canonical LDT/TSS base, set the error code to the selector. Intel's SDM says nothing about the #GP, but AMD's APM explicitly states that both LLDT and LTR set the error code to the selector, not zero.

Note, a non-canonical memory operand on LLDT/LTR does generate a #GP(0), but the KVM code in question is specific to the base from the descriptor.
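A standalone model of the resulting semantics (illustrative only; it assumes the emulator's usual selector-format error code, i.e. the selector with its RPL bits cleared):

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * #GP error code when the descriptor *base* is non-canonical: the
	 * selector, per AMD's APM.  A non-canonical memory *operand* would
	 * instead yield #GP(0).
	 */
	static uint16_t gp_error_code_for_base(uint16_t selector)
	{
		return selector & 0xfffc;	/* keep index + TI, clear RPL */
	}

	int main(void)
	{
		printf("#GP(%#x)\n", gp_error_code_for_base(0x2b));	/* #GP(0x28) */
		return 0;
	}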
Fixes: e37a75a13cda ("KVM: x86: Emulator ignores LDTR/TR extended base on LLDT/LTR")
Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson
Reviewed-by: Maxim Levitsky
Link: https://lore.kernel.org/r/20220711232750.1092012-3-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/emulate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 09e4b67b881f..bd9e9c5627d0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1736,8 +1736,8 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 		if (ret != X86EMUL_CONTINUE)
 			return ret;
 		if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
-				((u64)base3 << 32), ctxt))
-			return emulate_gp(ctxt, 0);
+				((u64)base3 << 32), ctxt))
+			return emulate_gp(ctxt, err_code);
 	}

 	if (seg == VCPU_SREG_TR) {
-- 
cgit v1.2.3

From 0bc273266112937cd6560f7d447b6aa9cfd9134a Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Mon, 11 Jul 2022 23:27:50 +0000
Subject: KVM: x86: WARN only once if KVM leaves a dangling userspace I/O request

Change a WARN_ON() into separate WARN_ON_ONCE()s if KVM has an outstanding PIO or MMIO request without an associated callback, i.e. if KVM queued a userspace I/O exit but didn't actually exit to userspace before moving on to something else. Warning on every KVM_RUN risks spamming the kernel if KVM gets into a bad state. Opportunistically split the WARNs so that it's easier to triage failures when a WARN fires.

Deliberately do not use KVM_BUG_ON(), i.e. don't kill the VM. While the WARN is all but guaranteed to fire if and only if there's a KVM bug, a dangling I/O request does not present a danger to KVM (that flag is truly consumed only in a single emulator path), and any such bug is unlikely to be fatal to the VM (KVM essentially failed to do something it shouldn't have tried to do in the first place). In other words, note the bug, but let the VM keep running.

Signed-off-by: Sean Christopherson
Reviewed-by: Maxim Levitsky
Link: https://lore.kernel.org/r/20220711232750.1092012-4-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/x86.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index db46c060acb5..0729e434c51b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10849,8 +10849,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 		r = cui(vcpu);
 		if (r <= 0)
 			goto out;
-	} else
-		WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
+	} else {
+		WARN_ON_ONCE(vcpu->arch.pio.count);
+		WARN_ON_ONCE(vcpu->mmio_needed);
+	}

 	if (kvm_run->immediate_exit) {
 		r = -EINTR;
-- 
cgit v1.2.3

From b184b35d06b2a3de65ff2ef4303f83535572266c Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Tue, 12 Jul 2022 01:58:38 +0000
Subject: KVM: VMX: Update PT MSR intercepts during filter change iff PT in host+guest

Update the Processor Trace (PT) MSR intercepts during a filter change if and only if PT may be exposed to the guest, i.e. only if KVM is operating in the so-called "host+guest" mode, where PT can be used simultaneously by both the host and guest. If PT is in system mode, the host is the sole owner of PT and the MSRs should never be passed through to the guest.
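In outline, the intended gating is simply the following (a sketch; the function names are the ones used in the patch below):

	static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
	{
		/* ... recompute the other MSR intercepts ... */

		/* PT MSRs can be passed through iff PT is exposed to the guest. */
		if (vmx_pt_mode_is_host_guest())
			pt_update_intercept_for_msr(vcpu);
	}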
Luckily the missed check only results in unnecessary work, as select RTIT MSRs are passed through only when RTIT tracing is enabled "in" the guest, and tracing can't be enabled in the guest when KVM is in system mode (writes to guest.MSR_IA32_RTIT_CTL are disallowed). Cc: Xiaoyao Li Signed-off-by: Sean Christopherson Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20220712015838.1253995-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c30115b9cb33..3842e121070a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4004,7 +4004,9 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); } - pt_update_intercept_for_msr(vcpu); + /* PT MSRs can be passed through iff PT is exposed to the guest. */ + if (vmx_pt_mode_is_host_guest()) + pt_update_intercept_for_msr(vcpu); } static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 79e48cec6cba4eee0bd3a13f31320e33a1729931 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 12 Jul 2022 02:07:22 +0000 Subject: KVM: x86/mmu: Add optimized helper to retrieve an SPTE's index Add spte_index() to dedup all the code that calculates a SPTE's index into its parent's page table and/or spt array. Opportunistically tweak the calculation to avoid pointer arithmetic, which is subtle (subtract in 8-byte chunks) and less performant (requires the compiler to generate the subtraction). Suggested-by: David Matlack Reviewed-by: David Matlack Signed-off-by: Sean Christopherson Message-Id: <20220712020724.1262121-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 22 ++++++++++------------ arch/x86/kvm/mmu/paging_tmpl.h | 4 ++-- arch/x86/kvm/mmu/spte.h | 6 ++++++ 3 files changed, 18 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f7fa4c31b7c5..864a32f96082 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1038,7 +1038,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); - gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); + gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte)); /* * Unlike rmap_add, rmap_remove does not run in the context of a vCPU @@ -1589,7 +1589,7 @@ static void __rmap_add(struct kvm *kvm, int rmap_count; sp = sptep_to_sp(spte); - kvm_mmu_page_set_translation(sp, spte - sp->spt, gfn, access); + kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access); kvm_update_page_stats(kvm, sp->role.level, 1); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); @@ -1716,11 +1716,9 @@ static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) static void mark_unsync(u64 *spte) { struct kvm_mmu_page *sp; - unsigned int index; sp = sptep_to_sp(spte); - index = spte - sp->spt; - if (__test_and_set_bit(index, sp->unsync_child_bitmap)) + if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap)) return; if (sp->unsync_children++) return; @@ -2203,7 +2201,7 @@ static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, unsig */ if (role.has_4_byte_gpte) { WARN_ON_ONCE(role.level != PG_LEVEL_4K); - role.quadrant = (sptep - parent_sp->spt) % 2; + role.quadrant = spte_index(sptep) & 1; } return role; @@ -2828,7 +2826,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, 
struct kvm_memory_slot *slot, rmap_add(vcpu, slot, sptep, gfn, pte_access); } else { /* Already rmapped but the pte_access bits may have changed. */ - kvm_mmu_page_set_access(sp, sptep - sp->spt, pte_access); + kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access); } return ret; @@ -2844,7 +2842,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, int i, ret; gfn_t gfn; - gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); + gfn = kvm_mmu_page_get_gfn(sp, spte_index(start)); slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); if (!slot) return -1; @@ -2870,7 +2868,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, WARN_ON(!sp->role.direct); - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { @@ -6156,8 +6154,8 @@ static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *hu unsigned int access; gfn_t gfn; - gfn = kvm_mmu_page_get_gfn(huge_sp, huge_sptep - huge_sp->spt); - access = kvm_mmu_page_get_access(huge_sp, huge_sptep - huge_sp->spt); + gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); + access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep)); /* * Note, huge page splitting always uses direct shadow pages, regardless @@ -6231,7 +6229,7 @@ static int shadow_mmu_try_split_huge_page(struct kvm *kvm, u64 spte; /* Grab information for the tracepoint before dropping the MMU lock. */ - gfn = kvm_mmu_page_get_gfn(huge_sp, huge_sptep - huge_sp->spt); + gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); level = huge_sp->role.level; spte = *huge_sptep; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 94bfd5f723ba..f5958071220c 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -595,7 +595,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (sp->role.direct) return __direct_pte_prefetch(vcpu, sp, sptep); - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { @@ -933,7 +933,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) break; pte_gpa = FNAME(get_level1_sp_gpa)(sp); - pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); + pte_gpa += spte_index(sptep) * sizeof(pt_element_t); mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); if (is_shadow_present_pte(old_spte)) diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index b5c855f5514f..ba3dccb202bc 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -190,6 +190,12 @@ static inline bool is_removed_spte(u64 spte) return spte == REMOVED_SPTE; } +/* Get an SPTE's index into its parent's page table (and the spt array). 
*/
+static inline int spte_index(u64 *sptep)
+{
+	return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1);
+}
+
 /*
  * In some cases, we need to preserve the GFN of a non-present or reserved
  * SPTE when we usurp the upper five bits of the physical address space to
-- 
cgit v1.2.3

From 39944ab99c2f9cc54272b9b871e3e759ebaa960b Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Tue, 12 Jul 2022 02:07:23 +0000
Subject: KVM: x86/mmu: Expand quadrant comment for PG_LEVEL_4K shadow pages

Tweak the comment above the computation of the quadrant for PG_LEVEL_4K shadow pages to explicitly call out how and why KVM uses role.quadrant to consume gPTE bits.

Opportunistically wrap an unnecessarily long line.

No functional change intended.

Link: https://lore.kernel.org/all/YqvWvBv27fYzOFdE@google.com
Reviewed-by: David Matlack
Signed-off-by: Sean Christopherson
Message-Id: <20220712020724.1262121-3-seanjc@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/mmu/mmu.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 864a32f96082..7a65e57b9b41 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2168,7 +2168,8 @@ static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
 	return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
 }

-static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, unsigned int access)
+static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct,
+						  unsigned int access)
 {
 	struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
 	union kvm_mmu_page_role role;
@@ -2195,9 +2196,15 @@ static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, unsig
 	 * uses 2 PAE page tables, each mapping a 2MiB region.  For these,
 	 * @role.quadrant encodes which half of the region they map.
 	 *
-	 * Note, the 4 PAE page directories are pre-allocated and the quadrant
-	 * assigned in mmu_alloc_root().  So only page tables need to be handled
-	 * here.
+	 * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE
+	 * consumes bits 29:21.  To consume bits 31:30, KVM uses 4 shadow
+	 * PDPTEs; those 4 PAE page directories are pre-allocated and their
+	 * quadrant is assigned in mmu_alloc_root().  A 4-byte PTE consumes
+	 * bits 21:12, while an 8-byte PTE consumes bits 20:12.  To consume
+	 * bit 21 in the PTE (the child here), KVM propagates that bit to the
+	 * quadrant, i.e. sets quadrant to '0' or '1'.  The parent 8-byte PDE
+	 * covers bit 21 (see above), thus the quadrant is calculated from the
+	 * _least_ significant bit of the PDE index.
 	 */
 	if (role.has_4_byte_gpte) {
 		WARN_ON_ONCE(role.level != PG_LEVEL_4K);
-- 
cgit v1.2.3

From dfd4eb444e5cef4387605d1eef1866ceea0544ce Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Tue, 12 Jul 2022 02:07:24 +0000
Subject: KVM: x86/mmu: Fix typo and tweak comment for split_desc_cache capacity

Remove a spurious closing parenthesis and tweak the comment about the cache capacity used for PTE descriptors (rmaps) during eager page splitting to tone down the assertion slightly, and to call out that topup requires dropping mmu_lock, which is the real motivation for avoiding topup (as opposed to memory usage).
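The "rmap->val points directly at that one entry" behavior referenced by the reworded comment (see the hunk below) comes from KVM's tagged-pointer rmap encoding, roughly (a simplified model, not the kernel's exact definitions):

	struct kvm_rmap_head {
		unsigned long val;
	};

	/*
	 * Simplified encoding: val == 0 means no entries; if the low bit is
	 * clear, val is the lone sptep itself and no allocation is needed;
	 * if the low bit is set, val points to an out-of-line descriptor
	 * list.  Eager page splitting only consumes cache objects when a
	 * second rmap entry for a gfn forces that descriptor allocation.
	 */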
Cc: David Matlack
Signed-off-by: Sean Christopherson
Message-Id: <20220712020724.1262121-4-seanjc@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/mmu/mmu.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 7a65e57b9b41..52664c3caaab 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6125,14 +6125,15 @@ static int topup_split_caches(struct kvm *kvm)
 {
 	/*
 	 * Allocating rmap list entries when splitting huge pages for nested
-	 * MMUs is uncommon as KVM needs to allocate if and only if there is
+	 * MMUs is uncommon as KVM needs to use a list if and only if there is
 	 * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be
-	 * aliased by multiple L2 gfns. Aliasing gfns when using TDP is very
-	 * atypical for VMMs; a few gfns are often aliased during boot, e.g.
-	 * when remapping firmware, but aliasing rarely occurs post-boot). If
-	 * there is only one rmap entry, rmap->val points directly at that one
-	 * entry and doesn't need to allocate a list. Buffer the cache by the
-	 * default capacity so that KVM doesn't have to topup the cache if it
+	 * aliased by multiple L2 gfns and/or from multiple nested roots with
+	 * different roles. Aliasing gfns when using TDP is atypical for VMMs;
+	 * a few gfns are often aliased during boot, e.g. when remapping BIOS,
+	 * but aliasing rarely occurs post-boot or for many gfns. If there is
+	 * only one rmap entry, rmap->val points directly at that one entry and
+	 * doesn't need to allocate a list. Buffer the cache by the default
+	 * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM
+	 * encounters an aliased gfn or two.
 	 */
 	const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS +
-- 
cgit v1.2.3

From bdc2d7ad10724de31463a0cdbd88b0ad7353a6de Mon Sep 17 00:00:00 2001
From: Maxim Levitsky
Date: Thu, 14 Jul 2022 15:44:53 +0300
Subject: KVM: SVM: Fix task switch emulation on INTn instruction

Recently, KVM's SVM code switched to re-injecting software interrupt events if something prevented their delivery. A task switch due to a task gate in the IDT, however, is an exception to this rule, because in this case the INTn instruction causes a task switch intercept, and its emulation completes the INTn emulation as well.

Add the missing case to task_switch_interception() for that.

This fixes the 32-bit KVM-unit-tests taskswitch2 test.

Fixes: 7e5b5ef8dca322 ("KVM: SVM: Re-inject INTn instead of retrying the insn on "failure"")
Signed-off-by: Maxim Levitsky
Reviewed-by: Maciej S. Szmigiero
Message-Id: <20220714124453.188655-1-mlevitsk@redhat.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/svm/svm.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 37ce061dfc76..1d8eb3333f46 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2446,6 +2446,7 @@ static int task_switch_interception(struct kvm_vcpu *vcpu)
 		kvm_clear_exception_queue(vcpu);
 		break;
 	case SVM_EXITINTINFO_TYPE_INTR:
+	case SVM_EXITINTINFO_TYPE_SOFT:
 		kvm_clear_interrupt_queue(vcpu);
 		break;
 	default:
-- 
cgit v1.2.3

From 277ad7d58611b455662f2e3f7bd24ce5bfeb2fdc Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Tue, 12 Jul 2022 02:06:45 +0200
Subject: KVM: x86: Add dedicated helper to get CPUID entry with significant index

Add a second CPUID helper, kvm_find_cpuid_entry_index(), to handle KVM queries for CPUID leaves whose index _may_ be significant, and drop the index param from the existing kvm_find_cpuid_entry(). Add a WARN in the inner helper, cpuid_entry2_find(), to detect attempts to retrieve a CPUID entry whose index is significant without explicitly providing an index.

Using an explicit magic number and letting callers omit the index avoids confusion by eliminating the myriad cases where KVM specifies '0' as a dummy value.

Suggested-by: Paolo Bonzini
Signed-off-by: Sean Christopherson
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/cpuid.c         | 80 +++++++++++++++++++++++++++++++++-----------
 arch/x86/kvm/cpuid.h         | 16 +++++----
 arch/x86/kvm/hyperv.c        |  8 ++---
 arch/x86/kvm/svm/svm.c       |  2 +-
 arch/x86/kvm/vmx/pmu_intel.c |  4 +--
 arch/x86/kvm/vmx/sgx.c       |  8 ++---
 arch/x86/kvm/vmx/vmx.c       |  6 ++--
 arch/x86/kvm/x86.c           |  2 +-
 8 files changed, 84 insertions(+), 42 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index d47222ab8e6e..75dcf7a72605 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -67,9 +67,17 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted)
 #define F feature_bit
 #define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)

+/*
+ * Magic value used when KVM queries userspace-provided CPUID entries and
+ * doesn't care about the CPUID index, because the index of the function in
+ * question is not significant.  Note, this magic value must have at least one
+ * bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find()
+ * to avoid false positives when processing guest CPUID input.
+ */
+#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull

 static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
-	struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
+	struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index)
 {
 	struct kvm_cpuid_entry2 *e;
 	int i;
@@ -77,9 +85,31 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
 	for (i = 0; i < nent; i++) {
 		e = &entries[i];

-		if (e->function == function &&
-		    (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index))
+		if (e->function != function)
+			continue;
+
+		/*
+		 * If the index isn't significant, use the first entry with a
+		 * matching function.  It's userspace's responsibility to not
+		 * provide "duplicate" entries in all cases.
+		 */
+		if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index)
+			return e;
+
+		/*
+		 * Similarly, use the first matching entry if KVM is doing a
+		 * lookup (as opposed to emulating CPUID) for a function that's
+		 * architecturally defined as not having a significant index.
+ */ + if (index == KVM_CPUID_INDEX_NOT_SIGNIFICANT) { + /* + * Direct lookups from KVM should not diverge from what + * KVM defines internally (the architectural behavior). + */ + WARN_ON_ONCE(cpuid_function_is_indexed(function)); return e; + } } return NULL; @@ -96,7 +126,8 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, * The existing code assumes virtual address is 48-bit or 57-bit in the * canonical address checks; exit if it is ever changed. */ - best = cpuid_entry2_find(entries, nent, 0x80000008, 0); + best = cpuid_entry2_find(entries, nent, 0x80000008, + KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { int vaddr_bits = (best->eax & 0xff00) >> 8; @@ -151,7 +182,7 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) vcpu->arch.kvm_cpuid_base = 0; for_each_possible_hypervisor_cpuid_base(function) { - entry = kvm_find_cpuid_entry(vcpu, function, 0); + entry = kvm_find_cpuid_entry(vcpu, function); if (entry) { u32 signature[3]; @@ -177,7 +208,8 @@ static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *v if (!base) return NULL; - return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0); + return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, + KVM_CPUID_INDEX_NOT_SIGNIFICANT); } static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) @@ -219,7 +251,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e struct kvm_cpuid_entry2 *best; u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent); - best = cpuid_entry2_find(entries, nent, 1, 0); + best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { /* Update OSXSAVE bit */ if (boot_cpu_has(X86_FEATURE_XSAVE)) @@ -250,7 +282,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { - best = cpuid_entry2_find(entries, nent, 0x1, 0); + best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) cpuid_entry_change(best, X86_FEATURE_MWAIT, vcpu->arch.ia32_misc_enable_msr & @@ -285,7 +317,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *best; u64 guest_supported_xcr0; - best = kvm_find_cpuid_entry(vcpu, 1, 0); + best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; @@ -325,10 +357,10 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); + best = kvm_find_cpuid_entry(vcpu, 0x80000000); if (!best || best->eax < 0x80000008) goto not_found; - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + best = kvm_find_cpuid_entry(vcpu, 0x80000008); if (best) return best->eax & 0xff; not_found: @@ -1302,12 +1334,20 @@ out_free: return r; } -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index) +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index) { return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, function, index); } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index); + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function) +{ + return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, KVM_CPUID_INDEX_NOT_SIGNIFICANT); +} 
EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); /* @@ -1344,7 +1384,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) struct kvm_cpuid_entry2 *basic, *class; u32 function = *fn_ptr; - basic = kvm_find_cpuid_entry(vcpu, 0, 0); + basic = kvm_find_cpuid_entry(vcpu, 0); if (!basic) return NULL; @@ -1353,11 +1393,11 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) return NULL; if (function >= 0x40000000 && function <= 0x4fffffff) - class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00, 0); + class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00); else if (function >= 0xc0000000) - class = kvm_find_cpuid_entry(vcpu, 0xc0000000, 0); + class = kvm_find_cpuid_entry(vcpu, 0xc0000000); else - class = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + class = kvm_find_cpuid_entry(vcpu, function & 0x80000000); if (class && function <= class->eax) return NULL; @@ -1375,7 +1415,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) * the effective CPUID entry is the max basic leaf. Note, the index of * the original requested leaf is observed! */ - return kvm_find_cpuid_entry(vcpu, basic->eax, index); + return kvm_find_cpuid_entry_index(vcpu, basic->eax, index); } bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, @@ -1385,7 +1425,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, struct kvm_cpuid_entry2 *entry; bool exact, used_max_basic = false; - entry = kvm_find_cpuid_entry(vcpu, function, index); + entry = kvm_find_cpuid_entry_index(vcpu, function, index); exact = !!entry; if (!entry && !exact_only) { @@ -1414,7 +1454,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, * exists. EDX can be copied from any existing index. */ if (function == 0xb || function == 0x1f) { - entry = kvm_find_cpuid_entry(vcpu, function, 1); + entry = kvm_find_cpuid_entry_index(vcpu, function, 1); if (entry) { *ecx = index & 0xff; *edx = entry->edx; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index ac72aabba981..b1658c0de847 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -13,8 +13,10 @@ void kvm_set_cpu_caps(void); void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); void kvm_update_pv_runtime(struct kvm_vcpu *vcpu); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index); + u32 function); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type); @@ -76,7 +78,7 @@ static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); struct kvm_cpuid_entry2 *entry; - entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index); + entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index); if (!entry) return NULL; @@ -109,7 +111,7 @@ static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0, 0); + best = kvm_find_cpuid_entry(vcpu, 0); return best && (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) || is_guest_vendor_hygon(best->ebx, best->ecx, best->edx)); @@ -119,7 +121,7 @@ static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0, 0); + best = kvm_find_cpuid_entry(vcpu, 0); return best && is_guest_vendor_intel(best->ebx, best->ecx, 
best->edx); } @@ -127,7 +129,7 @@ static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; @@ -138,7 +140,7 @@ static inline int guest_cpuid_model(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; @@ -154,7 +156,7 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index e2e95a6fccfd..ed804447589c 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1992,7 +1992,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *entry; struct kvm_vcpu_hv *hv_vcpu; - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE); if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) { vcpu->arch.hyperv_enabled = true; } else { @@ -2005,7 +2005,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu = to_hv_vcpu(vcpu); - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); if (entry) { hv_vcpu->cpuid_cache.features_eax = entry->eax; hv_vcpu->cpuid_cache.features_ebx = entry->ebx; @@ -2016,7 +2016,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu->cpuid_cache.features_edx = 0; } - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO); if (entry) { hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax; hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx; @@ -2025,7 +2025,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu->cpuid_cache.enlightenments_ebx = 0; } - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); if (entry) hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax; else diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1d8eb3333f46..84414eafcd0d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4194,7 +4194,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) /* For sev guests, the memory encryption bit is not reserved in CR3. 
*/ if (sev_guest(vcpu->kvm)) { - best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0); + best = kvm_find_cpuid_entry(vcpu, 0x8000001F); if (best) vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 53ccba896e77..4bc098fbec31 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -531,7 +531,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->pebs_enable_mask = ~0ull; pmu->pebs_data_cfg_mask = ~0ull; - entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + entry = kvm_find_cpuid_entry(vcpu, 0xa); if (!entry || !vcpu->kvm->arch.enable_pmu) return; eax.full = entry->eax; @@ -577,7 +577,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->global_ovf_ctrl_mask &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; - entry = kvm_find_cpuid_entry(vcpu, 7, 0); + entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); if (entry && (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 7ae8aa73724c..aba8cebdc587 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -148,8 +148,8 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, u8 max_size_log2; int trapnr, ret; - sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0); - sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1); + sgx_12_0 = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); + sgx_12_1 = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); if (!sgx_12_0 || !sgx_12_1) { kvm_prepare_emulation_failure_exit(vcpu); return 0; @@ -431,7 +431,7 @@ static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) if (!vcpu->kvm->arch.sgx_provisioning_allowed) return true; - guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0); + guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); if (!guest_cpuid) return true; @@ -439,7 +439,7 @@ static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx) return true; - guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1); + guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); if (!guest_cpuid) return true; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3842e121070a..e6ab2c2c4d3b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7430,7 +7430,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ } while (0) - entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); + entry = kvm_find_cpuid_entry(vcpu, 0x1); cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); @@ -7446,7 +7446,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); - entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); + entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); @@ -7481,7 +7481,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) int i; for (i = 0; i < PT_CPUID_LEAVES; i++) { - best = kvm_find_cpuid_entry(vcpu, 0x14, i); + best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); if (!best) return; vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = 
best->eax; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0729e434c51b..f389691d8c04 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11735,7 +11735,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry * on RESET. But, go through the motions in case that's ever remedied. */ - cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0); + cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1); kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600); static_call(kvm_x86_vcpu_reset)(vcpu, init_event); -- cgit v1.2.3 From ba28401bb93e5c779d5762c4eb0eb665493e5dd4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Jul 2022 15:37:07 +0000 Subject: KVM: x86: Restrict get_mt_mask() to a u8, use KVM_X86_OP_OPTIONAL_RET0 Restrict get_mt_mask() to a u8 and reintroduce using a RET0 static_call for the SVM implementation. EPT stores the memtype information in the lower 8 bits (bits 6:3 to be precise), and even returns a shifted u8 without an explicit cast to a larger type; there's no need to return a full u64. Note, RET0 doesn't play nice with a u64 return on 32-bit kernels, see commit bf07be36cd88 ("KVM: x86: do not use KVM_X86_OP_OPTIONAL_RET0 for get_mt_mask"). Signed-off-by: Sean Christopherson Message-Id: <20220714153707.3239119-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 2 +- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm/svm.c | 6 ------ arch/x86/kvm/vmx/vmx.c | 2 +- 4 files changed, 3 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 6f2f1affbb78..51f777071584 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -88,7 +88,7 @@ KVM_X86_OP(deliver_interrupt) KVM_X86_OP_OPTIONAL(sync_pir_to_irr) KVM_X86_OP_OPTIONAL_RET0(set_tss_addr) KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr) -KVM_X86_OP(get_mt_mask) +KVM_X86_OP_OPTIONAL_RET0(get_mt_mask) KVM_X86_OP(load_mmu_pgd) KVM_X86_OP(has_wbinvd_exit) KVM_X86_OP(get_l2_tsc_offset) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d4ece7bf2124..e8281d64a431 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1545,7 +1545,7 @@ struct kvm_x86_ops { int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); - u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); + u8 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 84414eafcd0d..ba81a7e58f75 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4159,11 +4159,6 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) return true; } -static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - return 0; -} - static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4815,7 +4810,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, .apicv_post_state_restore = avic_apicv_post_state_restore, - .get_mt_mask = svm_get_mt_mask, .get_exit_info = svm_get_exit_info, .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, diff --git 
a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e6ab2c2c4d3b..b0cc911a8f6f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7354,7 +7354,7 @@ static int __init vmx_check_processor_compat(void)
 	return 0;
 }

-static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
 	u8 cache;
-- 
cgit v1.2.3

From 8031d87aa9953ddeb047a5356ebd0b240c30f233 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Tue, 12 Jul 2022 11:46:53 -0700
Subject: KVM: x86: Check target, not vCPU's x2APIC ID, when applying hotplug hack

When applying the hotplug hack to match x2APIC IDs for vCPUs in xAPIC mode, check the target APIC ID for being unaddressable in xAPIC mode instead of checking the vCPU's x2APIC ID, and in that case proceed as if apic_x2apic_mode(vcpu) were true.

Functionally, it does not matter whether you compare kvm_x2apic_id(apic) or mda with 0xff, since the two values are then checked for equality. But in isolation, checking the x2APIC ID takes an unnecessary dependency on the x2APIC ID being read-only (which isn't strictly true on AMD CPUs, and is difficult to document as well); it also requires KVM to fall through and check the xAPIC ID as well to deal with a writable xAPIC ID, whereas the xAPIC ID _can't_ match a target ID greater than 0xff.

Opportunistically reword the comment to call out the various subtleties, and to fix a typo reported by Zhang Jiaming.

No functional change intended.

Cc: Zhang Jiaming
Signed-off-by: Sean Christopherson
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/lapic.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 50354c7a2dc1..9d4f73c4dc02 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -826,17 +826,17 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
 	if (kvm_apic_broadcast(apic, mda))
 		return true;

-	if (apic_x2apic_mode(apic))
-		return mda == kvm_x2apic_id(apic);
-
 	/*
-	 * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if
-	 * it were in x2APIC mode.  Hotplugged VCPUs start in xAPIC mode and
-	 * this allows unique addressing of VCPUs with APIC ID over 0xff.
-	 * The 0xff condition is needed because writeable xAPIC ID.
+	 * Hotplug hack: Accept interrupts for vCPUs in xAPIC mode as if they
+	 * were in x2APIC mode if the target APIC ID can't be encoded as an
+	 * xAPIC ID.  This allows unique addressing of hotplugged vCPUs (which
+	 * start in xAPIC mode) with an APIC ID that is unaddressable in xAPIC
+	 * mode.  Match the x2APIC ID if and only if the target APIC ID can't
+	 * be encoded in xAPIC to avoid spurious matches against a vCPU that
+	 * changed its (addressable) xAPIC ID (which is writable).
 	 */
-	if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic))
-		return true;
+	if (apic_x2apic_mode(apic) || mda > 0xff)
+		return mda == kvm_x2apic_id(apic);

 	return mda == kvm_xapic_id(apic);
 }
-- 
cgit v1.2.3

From da0b93d65e5bba6d4381f9dc99c547b60582e7a7 Mon Sep 17 00:00:00 2001
From: "Maciej S. Szmigiero"
Date: Mon, 18 Jul 2022 17:47:13 +0200
Subject: KVM: nSVM: Pull CS.Base from actual VMCB12 for soft int/ex re-injection

enter_svm_guest_mode() first calls nested_vmcb02_prepare_control() to copy control fields from VMCB12 to the current VMCB, then nested_vmcb02_prepare_save() to perform a similar copy of the save area.
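In outline (paraphrasing enter_svm_guest_mode(), not the verbatim source):

	svm_switch_vmcb(svm, &svm->nested.vmcb02);
	nested_vmcb02_prepare_control(svm, ...);	/* save area still stale here */
	nested_vmcb02_prepare_save(svm, vmcb12);	/* save fields copied only now */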
This means that nested_vmcb02_prepare_control() still runs with the previous save area values in the current VMCB, so it shouldn't take the L2 guest CS.Base from this area. Explicitly pull CS.Base from the actual VMCB12 instead in enter_svm_guest_mode().

Granted, having a non-zero CS.Base is a very rare thing (and even impossible in 64-bit mode), and having it change between nested VMRUNs is probably even rarer, but if it happens it would create a really subtle bug, so it's better to fix it upfront.

Fixes: 6ef88d6e36c2 ("KVM: SVM: Re-inject INT3/INTO instead of retrying the instruction")
Signed-off-by: Maciej S. Szmigiero
Message-Id: <4caa0f67589ae3c22c311ee0e6139496902f2edc.1658159083.git.maciej.szmigiero@oracle.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/svm/nested.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index adf4120b05d9..23252ab82194 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -639,7 +639,8 @@ static bool is_evtinj_nmi(u32 evtinj)
 }

 static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
-					  unsigned long vmcb12_rip)
+					  unsigned long vmcb12_rip,
+					  unsigned long vmcb12_csbase)
 {
 	u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
 	u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
@@ -711,7 +712,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
 	svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
 	if (is_evtinj_soft(vmcb02->control.event_inj)) {
 		svm->soft_int_injected = true;
-		svm->soft_int_csbase = svm->vmcb->save.cs.base;
+		svm->soft_int_csbase = vmcb12_csbase;
 		svm->soft_int_old_rip = vmcb12_rip;
 		if (svm->nrips_enabled)
 			svm->soft_int_next_rip = svm->nested.ctl.next_rip;
@@ -800,7 +801,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
 	nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);

 	svm_switch_vmcb(svm, &svm->nested.vmcb02);
-	nested_vmcb02_prepare_control(svm, vmcb12->save.rip);
+	nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base);
 	nested_vmcb02_prepare_save(svm, vmcb12);

 	ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3,
@@ -1663,7 +1664,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
 	nested_copy_vmcb_control_to_cache(svm, ctl);

 	svm_switch_vmcb(svm, &svm->nested.vmcb02);
-	nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip);
+	nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base);

 	/*
 	 * While the nested guest CR3 is already checked and set by
-- 
cgit v1.2.3

From 35d539c3e44f2021ff47a91cf4e6a35c550b6fbc Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Fri, 15 Jul 2022 22:42:20 +0000
Subject: KVM: x86/mmu: Return a u64 (the old SPTE) from mmu_spte_clear_track_bits()

Return a u64, not an int, from mmu_spte_clear_track_bits(). The return value is the old SPTE value, which is very much a 64-bit value. The sole caller that consumes the return value, drop_spte(), already uses a u64. The only reason that truncating the SPTE value is not problematic is that drop_spte() only queries the shadow-present bit, which is in the lower 32 bits.
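A standalone demonstration of the truncation hazard (illustrative; the exact bit position of the shadow-present flag is an assumption here, the argument above only relies on it living below bit 32):

	#include <stdint.h>
	#include <stdio.h>

	#define SHADOW_PRESENT	(1ull << 11)	/* assumed low-bit flag */

	int main(void)
	{
		/* high PFN bits plus the present flag */
		uint64_t old_spte = (1ull << 51) | SHADOW_PRESENT;
		int truncated = (int)old_spte;	/* old int return: bits 63:32 lost */

		/* the present check happens to survive truncation... */
		printf("present: %d\n", !!(truncated & SHADOW_PRESENT));
		/* ...but any other use of the old SPTE would be broken */
		printf("full=%#llx truncated=%#x\n",
		       (unsigned long long)old_spte, (unsigned)truncated);
		return 0;
	}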
Signed-off-by: Sean Christopherson Message-Id: <20220715224226.3749507-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 52664c3caaab..b2379ede2ed6 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -529,7 +529,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * state bits, it is used to clear the last level sptep. * Returns the old PTE. */ -static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) +static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) { kvm_pfn_t pfn; u64 old_spte = *sptep; -- cgit v1.2.3 From a42989e7fbb0186d9fee05b29e0ea9cb639d0bd3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Jul 2022 22:42:21 +0000 Subject: KVM: x86/mmu: Directly "destroy" PTE list when recycling rmaps Use pte_list_destroy() directly when recycling rmaps instead of bouncing through kvm_unmap_rmapp() and kvm_zap_rmapp(). Calling kvm_unmap_rmapp() is unnecessary and odd as it requires passing dummy parameters; passing NULL for @slot when __rmap_add() already has a valid slot is especially weird and confusing. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220715224226.3749507-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b2379ede2ed6..92fcffec0227 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1596,7 +1596,7 @@ static void __rmap_add(struct kvm *kvm, rmap_count = pte_list_add(cache, spte, rmap_head); if (rmap_count > RMAP_RECYCLE_THRESHOLD) { - kvm_unmap_rmapp(kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); + pte_list_destroy(kvm, rmap_head); kvm_flush_remote_tlbs_with_address( kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); } -- cgit v1.2.3 From aed02fe3cae41c4f9a5f31390b198025bfc9cf88 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Jul 2022 22:42:22 +0000 Subject: KVM: x86/mmu: Drop the "p is for pointer" from rmap helpers Drop the trailing "p" from rmap helpers, i.e. rename functions to simply be kvm__rmap(). Declaring that a function takes a pointer is completely unnecessary and goes against kernel style. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220715224226.3749507-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 92fcffec0227..fec999d2fc13 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -403,7 +403,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) * The idea using the light way get the spte on x86_32 guest is from * gup_get_pte (mm/gup.c). * - * An spte tlb flush may be pending, because kvm_set_pte_rmapp + * An spte tlb flush may be pending, because kvm_set_pte_rmap * coalesces them and we are running out of the MMU lock. Therefore * we need to protect against in-progress updates of the spte. 
* @@ -1383,22 +1383,22 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } -static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - const struct kvm_memory_slot *slot) +static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot) { return pte_list_destroy(kvm, rmap_head); } -static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t unused) +static bool kvm_unmap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t unused) { - return kvm_zap_rmapp(kvm, rmap_head, slot); + return kvm_zap_rmap(kvm, rmap_head, slot); } -static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t pte) +static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t pte) { u64 *sptep; struct rmap_iterator iter; @@ -1529,7 +1529,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) bool flush = false; if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); + flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmap); if (is_tdp_mmu_enabled(kvm)) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); @@ -1542,7 +1542,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool flush = false; if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); + flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap); if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); @@ -1550,9 +1550,9 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return flush; } -static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t unused) +static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t unused) { u64 *sptep; struct rmap_iterator iter; @@ -1564,9 +1564,9 @@ static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, return young; } -static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, - int level, pte_t unused) +static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, + int level, pte_t unused) { u64 *sptep; struct rmap_iterator iter; @@ -1615,7 +1615,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool young = false; if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); + young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); @@ -1628,7 +1628,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool young = false; if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); + young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); @@ -6004,8 +6004,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if 
(WARN_ON_ONCE(start >= end)) continue; - flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - + flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmap, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true, flush); } -- cgit v1.2.3 From 2833eda0e296ba3c410e9d57636de14d2589ad4e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Jul 2022 22:42:23 +0000 Subject: KVM: x86/mmu: Rename __kvm_zap_rmaps() to align with other nomenclature Rename __kvm_zap_rmaps() to kvm_rmap_zap_gfn_range() to avoid future confusion with a soon-to-be-introduced __kvm_zap_rmap(). Using a plural "rmaps" is somewhat ambiguous without additional context, as it's not obvious whether it's referring to multiple rmap lists, versus multiple rmap entries within a single list. Use kvm_rmap_zap_gfn_range() to align with the pattern established by kvm_rmap_zap_collapsible_sptes(), without losing the information that it zaps only rmap-based MMUs, i.e. don't rename it to __kvm_zap_gfn_range(). No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220715224226.3749507-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index fec999d2fc13..61c32d8d1f6d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5982,7 +5982,7 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) mmu_free_vm_memory_caches(kvm); } -static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) +static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { const struct kvm_memory_slot *memslot; struct kvm_memslots *slots; @@ -6029,7 +6029,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) kvm_inc_notifier_count(kvm, gfn_start, gfn_end); - flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end); + flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); if (is_tdp_mmu_enabled(kvm)) { for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) -- cgit v1.2.3 From f8480721a74b82c6e88fff09286aa452d5ed6d71 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Jul 2022 22:42:24 +0000 Subject: KVM: x86/mmu: Rename rmap zap helpers to eliminate "unmap" wrapper Rename kvm_unmap_rmap() and kvm_zap_rmap() to kvm_zap_rmap() and __kvm_zap_rmap() respectively to show that what was the "unmap" helper is just a wrapper for the "zap" helper, i.e. that they do the exact same thing, one just exists to deal with its caller passing in more params. No functional change intended. 
Signed-off-by: Sean Christopherson Message-Id: <20220715224226.3749507-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 61c32d8d1f6d..00be88e0a5f7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1383,17 +1383,17 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } -static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - const struct kvm_memory_slot *slot) +static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot) { return pte_list_destroy(kvm, rmap_head); } -static bool kvm_unmap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t unused) +static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t unused) { - return kvm_zap_rmap(kvm, rmap_head, slot); + return __kvm_zap_rmap(kvm, rmap_head, slot); } static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -1529,7 +1529,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) bool flush = false; if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmap); + flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap); if (is_tdp_mmu_enabled(kvm)) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); @@ -6004,7 +6004,7 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e if (WARN_ON_ONCE(start >= end)) continue; - flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmap, + flush = slot_handle_level_range(kvm, memslot, __kvm_zap_rmap, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true, flush); } -- cgit v1.2.3 From 9202aee816c84d69179f94193c5dd321bb0e8530 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Jul 2022 22:42:25 +0000 Subject: KVM: x86/mmu: Rename pte_list_{destroy,remove}() to show they zap SPTEs Rename pte_list_remove() and pte_list_destroy() to kvm_zap_one_rmap_spte() and kvm_zap_all_rmap_sptes() respectively to document that (a) they zap SPTEs and (b) to better document how they differ (remove vs. destroy does not exactly scream "one vs. all"). No functional change intended. 
Signed-off-by: Sean Christopherson Message-Id: <20220715224226.3749507-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 00be88e0a5f7..282e7e2ab446 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -957,15 +957,16 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) } } -static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - u64 *sptep) +static void kvm_zap_one_rmap_spte(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, u64 *sptep) { mmu_spte_clear_track_bits(kvm, sptep); __pte_list_remove(sptep, rmap_head); } -/* Return true if rmap existed, false otherwise */ -static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +/* Return true if at least one SPTE was zapped, false otherwise */ +static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, + struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc, *next; int i; @@ -1386,7 +1387,7 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { - return pte_list_destroy(kvm, rmap_head); + return kvm_zap_all_rmap_sptes(kvm, rmap_head); } static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -1417,7 +1418,7 @@ restart: need_flush = true; if (pte_write(pte)) { - pte_list_remove(kvm, rmap_head, sptep); + kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); goto restart; } else { new_spte = kvm_mmu_changed_pte_notifier_make_spte( @@ -1596,7 +1597,7 @@ static void __rmap_add(struct kvm *kvm, rmap_count = pte_list_add(cache, spte, rmap_head); if (rmap_count > RMAP_RECYCLE_THRESHOLD) { - pte_list_destroy(kvm, rmap_head); + kvm_zap_all_rmap_sptes(kvm, rmap_head); kvm_flush_remote_tlbs_with_address( kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); } @@ -6406,7 +6407,7 @@ restart: if (sp->role.direct && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, pfn, PG_LEVEL_NUM)) { - pte_list_remove(kvm, rmap_head, sptep); + kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, -- cgit v1.2.3 From 3c2e10373ec73cdd78411a28a14a9abb9da1bef6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Jul 2022 22:42:26 +0000 Subject: KVM: x86/mmu: Remove underscores from __pte_list_remove() Remove the underscores from __pte_list_remove(), the function formerly known as pte_list_remove() is now named kvm_zap_one_rmap_spte() to show that it zaps rmaps/PTEs, i.e. doesn't just remove an entry from a list. No functional change intended. 
Signed-off-by: Sean Christopherson Message-Id: <20220715224226.3749507-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 282e7e2ab446..5957c3e66b77 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -921,7 +921,7 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, mmu_free_pte_list_desc(desc); } -static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) +static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; struct pte_list_desc *prev_desc; @@ -961,7 +961,7 @@ static void kvm_zap_one_rmap_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, u64 *sptep) { mmu_spte_clear_track_bits(kvm, sptep); - __pte_list_remove(sptep, rmap_head); + pte_list_remove(sptep, rmap_head); } /* Return true if at least one SPTE was zapped, false otherwise */ @@ -1051,7 +1051,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) slot = __gfn_to_memslot(slots, gfn); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - __pte_list_remove(spte, rmap_head); + pte_list_remove(spte, rmap_head); } /* @@ -1693,7 +1693,7 @@ static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, u64 *parent_pte) { - __pte_list_remove(parent_pte, &sp->parent_ptes); + pte_list_remove(parent_pte, &sp->parent_ptes); } static void drop_parent_pte(struct kvm_mmu_page *sp, -- cgit v1.2.3 From 01e69cef63f88d809181f62f03a01d7295f2d5a4 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 18 Jul 2022 03:38:33 -0500 Subject: KVM: SVM: Fix x2APIC MSRs interception The index for svm_direct_access_msrs was incorrectly initialized with the APIC MMIO register macros. Fix by introducing a macro for calculating x2APIC MSRs. 
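The mapping the macro implements is mechanical: each legacy APIC MMIO register at offset 'x' occupies 16 bytes, and its x2APIC counterpart is MSR 0x800 + (x >> 4). A standalone sketch, using the standard values from the x86 APIC definitions (assumed here, not taken from this patch):

#include <stdio.h>

#define APIC_BASE_MSR	0x800
#define APIC_TASKPRI	0x80	/* TPR's MMIO offset */
#define X2APIC_MSR(x)	(APIC_BASE_MSR + ((x) >> 4))

int main(void)
{
	/* MMIO offset 0x80 maps to MSR 0x808, the x2APIC TPR. */
	printf("x2APIC TPR MSR: %#x\n", X2APIC_MSR(APIC_TASKPRI));
	return 0;
}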
Fixes: 5c127c85472c ("KVM: SVM: Adding support for configuring x2APIC MSRs interception") Cc: Maxim Levitsky Signed-off-by: Suravee Suthikulpanit Message-Id: <20220718083833.222117-1-suravee.suthikulpanit@amd.com> Reviewed-by: Maxim Levitsky Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 52 ++++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ba81a7e58f75..aef63aae922d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -74,6 +74,8 @@ static uint64_t osvw_len = 4, osvw_status; static DEFINE_PER_CPU(u64, current_tsc_ratio); +#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4)) + static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ bool always; /* True if intercept is initially cleared */ @@ -100,31 +102,31 @@ static const struct svm_direct_access_msrs { { .index = MSR_IA32_CR_PAT, .always = false }, { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, { .index = MSR_TSC_AUX, .always = false }, - { .index = (APIC_BASE_MSR + APIC_ID), .always = false }, - { .index = (APIC_BASE_MSR + APIC_LVR), .always = false }, - { .index = (APIC_BASE_MSR + APIC_TASKPRI), .always = false }, - { .index = (APIC_BASE_MSR + APIC_ARBPRI), .always = false }, - { .index = (APIC_BASE_MSR + APIC_PROCPRI), .always = false }, - { .index = (APIC_BASE_MSR + APIC_EOI), .always = false }, - { .index = (APIC_BASE_MSR + APIC_RRR), .always = false }, - { .index = (APIC_BASE_MSR + APIC_LDR), .always = false }, - { .index = (APIC_BASE_MSR + APIC_DFR), .always = false }, - { .index = (APIC_BASE_MSR + APIC_SPIV), .always = false }, - { .index = (APIC_BASE_MSR + APIC_ISR), .always = false }, - { .index = (APIC_BASE_MSR + APIC_TMR), .always = false }, - { .index = (APIC_BASE_MSR + APIC_IRR), .always = false }, - { .index = (APIC_BASE_MSR + APIC_ESR), .always = false }, - { .index = (APIC_BASE_MSR + APIC_ICR), .always = false }, - { .index = (APIC_BASE_MSR + APIC_ICR2), .always = false }, - { .index = (APIC_BASE_MSR + APIC_LVTT), .always = false }, - { .index = (APIC_BASE_MSR + APIC_LVTTHMR), .always = false }, - { .index = (APIC_BASE_MSR + APIC_LVTPC), .always = false }, - { .index = (APIC_BASE_MSR + APIC_LVT0), .always = false }, - { .index = (APIC_BASE_MSR + APIC_LVT1), .always = false }, - { .index = (APIC_BASE_MSR + APIC_LVTERR), .always = false }, - { .index = (APIC_BASE_MSR + APIC_TMICT), .always = false }, - { .index = (APIC_BASE_MSR + APIC_TMCCT), .always = false }, - { .index = (APIC_BASE_MSR + APIC_TDCR), .always = false }, + { .index = X2APIC_MSR(APIC_ID), .always = false }, + { .index = X2APIC_MSR(APIC_LVR), .always = false }, + { .index = X2APIC_MSR(APIC_TASKPRI), .always = false }, + { .index = X2APIC_MSR(APIC_ARBPRI), .always = false }, + { .index = X2APIC_MSR(APIC_PROCPRI), .always = false }, + { .index = X2APIC_MSR(APIC_EOI), .always = false }, + { .index = X2APIC_MSR(APIC_RRR), .always = false }, + { .index = X2APIC_MSR(APIC_LDR), .always = false }, + { .index = X2APIC_MSR(APIC_DFR), .always = false }, + { .index = X2APIC_MSR(APIC_SPIV), .always = false }, + { .index = X2APIC_MSR(APIC_ISR), .always = false }, + { .index = X2APIC_MSR(APIC_TMR), .always = false }, + { .index = X2APIC_MSR(APIC_IRR), .always = false }, + { .index = X2APIC_MSR(APIC_ESR), .always = false }, + { .index = X2APIC_MSR(APIC_ICR), .always = false }, + { .index = X2APIC_MSR(APIC_ICR2), .always = false }, + { .index = X2APIC_MSR(APIC_LVTT), .always = false }, + 
{ .index = X2APIC_MSR(APIC_LVTTHMR), .always = false }, + { .index = X2APIC_MSR(APIC_LVTPC), .always = false }, + { .index = X2APIC_MSR(APIC_LVT0), .always = false }, + { .index = X2APIC_MSR(APIC_LVT1), .always = false }, + { .index = X2APIC_MSR(APIC_LVTERR), .always = false }, + { .index = X2APIC_MSR(APIC_TMICT), .always = false }, + { .index = X2APIC_MSR(APIC_TMCCT), .always = false }, + { .index = X2APIC_MSR(APIC_TDCR), .always = false }, { .index = MSR_INVALID, .always = false }, }; -- cgit v1.2.3 From 94bda2f4cd868046094351b06b87d7d1053e990d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Jul 2022 23:00:13 +0000 Subject: KVM: x86: Reject loading KVM if host.PAT[0] != WB Reject KVM if entry '0' in the host's IA32_PAT MSR is not programmed to writeback (WB) memtype. KVM subtly relies on IA32_PAT entry '0' to be programmed to WB by leaving the PAT bits in shadow paging and NPT SPTEs as '0'. If something other than WB is in PAT[0], at _best_ guests will suffer very poor performance, and at worst KVM will crash the system by breaking cache-coherency expectations (e.g. using WC for guest memory). Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20220715230016.3762909-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f389691d8c04..12199c40f2bc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9141,6 +9141,7 @@ static struct notifier_block pvclock_gtod_notifier = { int kvm_arch_init(void *opaque) { struct kvm_x86_init_ops *ops = opaque; + u64 host_pat; int r; if (kvm_x86_ops.hardware_enable) { @@ -9179,6 +9180,20 @@ int kvm_arch_init(void *opaque) goto out; } + /* + * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes + * the PAT bits in SPTEs. Bail if PAT[0] is programmed to something + * other than WB. Note, EPT doesn't utilize the PAT, but don't bother + * with an exception. PAT[0] is set to WB on RESET and also by the + * kernel, i.e. failure indicates a kernel bug or broken firmware. + */ + if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) || + (host_pat & GENMASK(2, 0)) != 6) { + pr_err("kvm: host PAT[0] is not WB\n"); + r = -EIO; + goto out; + } + r = -ENOMEM; x86_emulator_cache = kvm_alloc_emulator_cache(); -- cgit v1.2.3 From 82ffad2ddf5d7b5b9c14c705fed4a7d5f2ec4c38 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Jul 2022 23:00:14 +0000 Subject: KVM: x86: Drop unnecessary goto+label in kvm_arch_init() Return directly if kvm_arch_init() detects an error before doing any real work; jumping through a label obfuscates what's happening and carries the unnecessary risk of leaving 'r' uninitialized. No functional change intended.
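As a generic illustration of why the early-return style is safer (standalone C with hypothetical stand-in helpers, not the kernel code):

#include <errno.h>

static int hardware_supported(void) { return 1; }	/* stand-in check */
static int firmware_sane(void)      { return 1; }	/* stand-in check */

static int example_init(void)
{
	/*
	 * Each failure path returns its error code directly; there is no
	 * shared label that could be reached with 'r' never assigned.
	 */
	if (!hardware_supported())
		return -EOPNOTSUPP;
	if (!firmware_sane())
		return -EIO;
	return 0;
}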
Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20220715230016.3762909-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 12199c40f2bc..41aa3137665c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9146,21 +9146,18 @@ int kvm_arch_init(void *opaque) if (kvm_x86_ops.hardware_enable) { pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name); - r = -EEXIST; - goto out; + return -EEXIST; } if (!ops->cpu_has_kvm_support()) { pr_err_ratelimited("kvm: no hardware support for '%s'\n", ops->runtime_ops->name); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } if (ops->disabled_by_bios()) { pr_err_ratelimited("kvm: support for '%s' disabled by bios\n", ops->runtime_ops->name); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } /* @@ -9170,14 +9167,12 @@ int kvm_arch_init(void *opaque) */ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) { printk(KERN_ERR "kvm: inadequate fpu\n"); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } /* @@ -9190,21 +9185,19 @@ int kvm_arch_init(void *opaque) if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) || (host_pat & GENMASK(2, 0)) != 6) { pr_err("kvm: host PAT[0] is not WB\n"); - r = -EIO; - goto out; + return -EIO; } - r = -ENOMEM; - x86_emulator_cache = kvm_alloc_emulator_cache(); if (!x86_emulator_cache) { pr_err("kvm: failed to allocate cache for x86 emulator\n"); - goto out; + return -ENOMEM; } user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); if (!user_return_msrs) { printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); + r = -ENOMEM; goto out_free_x86_emulator_cache; } kvm_nr_uret_msrs = 0; @@ -9235,7 +9228,6 @@ out_free_percpu: free_percpu(user_return_msrs); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); -out: return r; } -- cgit v1.2.3 From 38bf9d7bf277bb40c4dceedd2b5eb87d02c36d5b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Jul 2022 23:00:15 +0000 Subject: KVM: x86/mmu: Add shadow mask for effective host MTRR memtype Add shadow_memtype_mask to capture that EPT needs a non-zero memtype mask instead of relying on TDP being enabled, as NPT doesn't need a non-zero mask. This is a glorified nop as kvm_x86_ops.get_mt_mask() returns zero for NPT anyways. No functional change intended. 
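For reference, a standalone sketch of the mask the EPT path installs (bit positions per the VMX EPT definitions; the constant values are assumed here, not taken from this patch):

#include <stdint.h>
#include <stdio.h>

#define VMX_EPT_MT_EPTE_SHIFT	3			/* memtype: bits 5:3 */
#define VMX_EPT_MT_MASK		(7ULL << VMX_EPT_MT_EPTE_SHIFT)
#define VMX_EPT_IPAT_BIT	(1ULL << 6)		/* "ignore PAT" bit */

int main(void)
{
	uint64_t shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT;

	printf("EPT shadow_memtype_mask = %#llx\n",	/* prints 0x78 */
	       (unsigned long long)shadow_memtype_mask);
	return 0;
}

For shadow paging and NPT the mask stays zero, matching the PAT[0]==WB assumption that is now validated at module load.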
Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20220715230016.3762909-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/spte.c | 21 ++++++++++++++++++--- arch/x86/kvm/mmu/spte.h | 1 + 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index fb1f17504138..7314d27d57a4 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -33,6 +33,7 @@ u64 __read_mostly shadow_mmio_value; u64 __read_mostly shadow_mmio_mask; u64 __read_mostly shadow_mmio_access_mask; u64 __read_mostly shadow_present_mask; +u64 __read_mostly shadow_memtype_mask; u64 __read_mostly shadow_me_value; u64 __read_mostly shadow_me_mask; u64 __read_mostly shadow_acc_track_mask; @@ -161,10 +162,10 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; - if (tdp_enabled) - spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); + if (shadow_memtype_mask) + spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn, + kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= shadow_host_writable_mask; else @@ -391,6 +392,13 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) shadow_nx_mask = 0ull; shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK; + /* + * EPT overrides the host MTRRs, and so KVM must program the desired + * memtype directly into the SPTEs. Note, this mask is just the mask + * of all bits that factor into the memtype, the actual memtype must be + * dynamically calculated, e.g. to ensure host MMIO is mapped UC. + */ + shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT; shadow_acc_track_mask = VMX_EPT_RWX_MASK; shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; @@ -441,6 +449,13 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_nx_mask = PT64_NX_MASK; shadow_x_mask = 0; shadow_present_mask = PT_PRESENT_MASK; + + /* + * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB + * memtype in the SPTEs, i.e. relies on host MTRRs to provide the + * correct memtype (WB is the "weakest" memtype). + */ + shadow_memtype_mask = 0; shadow_acc_track_mask = 0; shadow_me_mask = 0; shadow_me_value = 0; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index ba3dccb202bc..cabe3fbb4f39 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -147,6 +147,7 @@ extern u64 __read_mostly shadow_mmio_value; extern u64 __read_mostly shadow_mmio_mask; extern u64 __read_mostly shadow_mmio_access_mask; extern u64 __read_mostly shadow_present_mask; +extern u64 __read_mostly shadow_memtype_mask; extern u64 __read_mostly shadow_me_value; extern u64 __read_mostly shadow_me_mask; -- cgit v1.2.3 From d5e90a699875ce552d0058d05b9b4b458b46fa6a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Jul 2022 23:00:16 +0000 Subject: KVM: x86/mmu: Restrict mapping level based on guest MTRR iff they're used Restrict the mapping level for SPTEs based on the guest MTRRs if and only if KVM may actually use the guest MTRRs to compute the "real" memtype. For all forms of paging, guest MTRRs are purely virtual in the sense that they are completely ignored by hardware, i.e. they affect the memtype only if software manually consumes them. 
The only scenario where KVM consumes the guest MTRRs is when shadow_memtype_mask is non-zero and the guest has non-coherent DMA; in all other cases KVM simply leaves the PAT field in SPTEs as '0' to encode WB memtype. Note, KVM may still ultimately ignore guest MTRRs, e.g. if the backing pfn is host MMIO, but false positives are ok as they only cause a slight performance blip (unless the guest is doing weird things with its MTRRs, which is extremely unlikely). Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20220715230016.3762909-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5957c3e66b77..7c0b10aac3ed 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4296,14 +4296,26 @@ EXPORT_SYMBOL_GPL(kvm_handle_page_fault); int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - while (fault->max_level > PG_LEVEL_4K) { - int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); - gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); - - if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) - break; + /* + * If the guest's MTRRs may be used to compute the "real" memtype, + * restrict the mapping level to ensure KVM uses a consistent memtype + * across the entire mapping. If the host MTRRs are ignored by TDP + * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA + * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype + * from the guest's MTRRs so that guest accesses to memory that is + * DMA'd aren't cached against the guest's wishes. + * + * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs, + * e.g. KVM will force UC memtype for host MMIO. + */ + if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) { + int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); + gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); - --fault->max_level; + if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) + break; + } } return direct_page_fault(vcpu, fault); -- cgit v1.2.3 From a8ac499bb6abbd55fe60f1dc2d053f4b5b13aa73 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Jul 2022 23:21:04 +0000 Subject: KVM: x86/mmu: Don't require refcounted "struct page" to create huge SPTEs Drop the requirement that a pfn be backed by a refcounted, compound or ZONE_DEVICE, struct page, and instead rely solely on the host page tables to identify huge pages. The PageCompound() check is a remnant of an old implementation that identified (well, attempted to identify) huge pages without walking the host page tables. The ZONE_DEVICE check was added as an exception to the PageCompound() requirement. In other words, neither check is actually a hard requirement; if the primary MMU has a pfn backed with a huge page, then KVM can back the pfn with a huge page regardless of the backing store. Dropping the @pfn parameter will also allow KVM to query the max host mapping level without having to first get the pfn, which is advantageous for use outside of the page fault path where KVM wants to take action if and only if a page can be mapped huge, i.e. avoids the pfn lookup for gfns that can't be backed with a huge page.
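The base-gfn computation retained in kvm_tdp_page_fault() above is simple mask arithmetic; a standalone sketch for a 2MiB level (4KiB base pages assumed, values hypothetical):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGES_PER_2M	(1ULL << (21 - PAGE_SHIFT))	/* 512 pages */

int main(void)
{
	uint64_t gpa  = 0x12345678ULL;		/* hypothetical guest PA */
	uint64_t gfn  = gpa >> PAGE_SHIFT;
	uint64_t base = gfn & ~(PAGES_PER_2M - 1);

	/* gfn 0x12345 rounds down to base 0x12200 for a 2MiB mapping. */
	printf("gfn=%#llx base=%#llx\n",
	       (unsigned long long)gfn, (unsigned long long)base);
	return 0;
}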
Cc: Mingwei Zhang Signed-off-by: Sean Christopherson Reviewed-by: Mingwei Zhang Message-Id: <20220715232107.3775620-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 23 +++++------------------ arch/x86/kvm/mmu/mmu_internal.h | 2 +- arch/x86/kvm/mmu/tdp_mmu.c | 8 +------- 3 files changed, 7 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7c0b10aac3ed..091278a8bd56 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2920,11 +2920,10 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, +static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, const struct kvm_memory_slot *slot) { int level = PG_LEVEL_4K; - struct page *page; unsigned long hva; unsigned long flags; pgd_t pgd; @@ -2932,17 +2931,6 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, pud_t pud; pmd_t pmd; - /* - * Note, @slot must be non-NULL, i.e. the caller is responsible for - * ensuring @pfn isn't garbage and is backed by a memslot. - */ - page = kvm_pfn_to_refcounted_page(pfn); - if (!page) - return PG_LEVEL_4K; - - if (!PageCompound(page) && !kvm_is_zone_device_page(page)) - return PG_LEVEL_4K; - /* * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() * is not solely for performance, it's also necessary to avoid the @@ -2995,7 +2983,7 @@ out: int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, - kvm_pfn_t pfn, int max_level) + int max_level) { struct kvm_lpage_info *linfo; int host_level; @@ -3010,7 +2998,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; - host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); + host_level = host_pfn_mapping_level(kvm, gfn, slot); return min(host_level, max_level); } @@ -3035,8 +3023,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * level, which will be used to do precise, accurate accounting. 
*/ fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, - fault->gfn, fault->pfn, - fault->max_level); + fault->gfn, fault->max_level); if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) return; @@ -6418,7 +6405,7 @@ restart: */ if (sp->role.direct && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, - pfn, PG_LEVEL_NUM)) { + PG_LEVEL_NUM)) { kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index ae2d660e2dab..582def531d4d 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -309,7 +309,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, - kvm_pfn_t pfn, int max_level); + int max_level); void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index f3a430d64975..d75d93edc40a 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1733,7 +1733,6 @@ static void zap_collapsible_spte_range(struct kvm *kvm, gfn_t end = start + slot->npages; struct tdp_iter iter; int max_mapping_level; - kvm_pfn_t pfn; rcu_read_lock(); @@ -1745,13 +1744,8 @@ static void zap_collapsible_spte_range(struct kvm *kvm, !is_last_spte(iter.old_spte, iter.level)) continue; - /* - * This is a leaf SPTE. Check if the PFN it maps can - * be mapped at a higher level. - */ - pfn = spte_to_pfn(iter.old_spte); max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, - iter.gfn, pfn, PG_LEVEL_NUM); + iter.gfn, PG_LEVEL_NUM); WARN_ON(max_mapping_level < iter.level); -- cgit v1.2.3 From 65e3b446bcceaac7448cb25a2a5bf4adbcf25fe6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Jul 2022 23:21:05 +0000 Subject: KVM: x86/mmu: Document the "rules" for using host_pfn_mapping_level() Add a comment to document how host_pfn_mapping_level() can be used safely, as the line between safe and dangerous is quite thin. E.g. if KVM were to ever support in-place promotion to create huge pages, consuming the level is safe if the caller holds mmu_lock and checks that there's an existing _leaf_ SPTE, but unsafe if the caller only checks that there's a non-leaf SPTE. Opportunistically tweak the existing comments to explicitly document why KVM needs to use READ_ONCE(). No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220715232107.3775620-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 091278a8bd56..8e477333a263 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2920,6 +2920,31 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } +/* + * Lookup the mapping level for @gfn in the current mm. + * + * WARNING! Use of host_pfn_mapping_level() requires the caller and the end + * consumer to be tied into KVM's handlers for MMU notifier events! + * + * There are several ways to safely use this helper: + * + * - Check mmu_notifier_retry_hva() after grabbing the mapping level, before + * consuming it. 
In this case, mmu_lock doesn't need to be held during the + lookup, but it does need to be held while checking the MMU notifier. + * + * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation + * event for the hva. This can be done by explicitly checking the MMU notifier + * or by ensuring that KVM already has a valid mapping that covers the hva. + * + * - Do not use the result to install new mappings, e.g. use the host mapping + * level only to decide whether or not to zap an entry. In this case, it's + * not required to hold mmu_lock (though it's highly likely the caller will + * want to hold mmu_lock anyways, e.g. to modify SPTEs). + * + * Note! The lookup can still race with modifications to host page tables, but + * the above "rules" ensure KVM will not _consume_ the result of the walk if a + * race with the primary MMU occurs. + */ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, const struct kvm_memory_slot *slot) { @@ -2942,16 +2967,19 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, hva = __gfn_to_hva_memslot(slot, gfn); /* - * Lookup the mapping level in the current mm. The information - * may become stale soon, but it is safe to use as long as - * 1) mmu_notifier_retry was checked after taking mmu_lock, and - * 2) mmu_lock is taken now. - * - * We still need to disable IRQs to prevent concurrent tear down - * of page tables. + * Disable IRQs to prevent concurrent tear down of host page tables, + * e.g. if the primary MMU promotes a P*D to a huge page and then frees + * the original page table. */ local_irq_save(flags); + /* + * Read each entry once. As above, a non-leaf entry can be promoted to + * a huge page _during_ this walk. Re-reading the entry could send the + * walk into the weeds, e.g. p*d_large() returns false (sees the old + * value) and then p*d_offset() walks into the target huge page instead + * of the old page table (sees the new value). + */ pgd = READ_ONCE(*pgd_offset(kvm->mm, hva)); if (pgd_none(pgd)) goto out; -- cgit v1.2.3 From 85f44f8cc07b5f61bef30fe5343d629fd4263230 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Jul 2022 23:21:06 +0000 Subject: KVM: x86/mmu: Don't bottom out on leafs when zapping collapsible SPTEs When zapping collapsible SPTEs in the TDP MMU, don't bottom out on a leaf SPTE now that KVM doesn't require a PFN to compute the host mapping level, i.e. now that there's no need to first find a leaf SPTE and then step back up. Drop the now unused tdp_iter_step_up(), as it is not the safest of helpers (using any of the low level iterators requires some understanding of the various side effects). Signed-off-by: Sean Christopherson Message-Id: <20220715232107.3775620-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_iter.c | 9 -------- arch/x86/kvm/mmu/tdp_iter.h | 1 - arch/x86/kvm/mmu/tdp_mmu.c | 51 +++++++++++++++++++++------------------------ 3 files changed, 24 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index 9c65a64a56d9..39b48e7d7d1a 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -145,15 +145,6 @@ static bool try_step_up(struct tdp_iter *iter) return true; } -/* - * Step the iterator back up a level in the paging structure. Should only be - * used when the iterator is below the root level.
- */ -void tdp_iter_step_up(struct tdp_iter *iter) -{ - WARN_ON(!try_step_up(iter)); -} - /* * Step to the next SPTE in a pre-order traversal of the paging structure. * To get to the next SPTE, the iterator either steps down towards the goal diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index adfca0cf94d3..f0af385c56e0 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -114,6 +114,5 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); void tdp_iter_restart(struct tdp_iter *iter); -void tdp_iter_step_up(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d75d93edc40a..40ccb5fba870 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1721,10 +1721,6 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } -/* - * Clear leaf entries which could be replaced by large mappings, for - * GFNs within the slot. - */ static void zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, const struct kvm_memory_slot *slot) @@ -1736,48 +1732,49 @@ static void zap_collapsible_spte_range(struct kvm *kvm, rcu_read_lock(); - tdp_root_for_each_pte(iter, root, start, end) { + for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) { +retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; - if (!is_shadow_present_pte(iter.old_spte) || - !is_last_spte(iter.old_spte, iter.level)) + if (iter.level > KVM_MAX_HUGEPAGE_LEVEL || + !is_shadow_present_pte(iter.old_spte)) continue; - max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, - iter.gfn, PG_LEVEL_NUM); - - WARN_ON(max_mapping_level < iter.level); - /* - * If this page is already mapped at the highest - * viable level, there's nothing more to do. + * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with + * a large page size, then its parent would have been zapped + * instead of stepping down. */ - if (max_mapping_level == iter.level) + if (is_last_spte(iter.old_spte, iter.level)) continue; /* - * The page can be remapped at a higher level, so step - * up to zap the parent SPTE. + * If iter.gfn resides outside of the slot, i.e. the page for + * the current level overlaps but is not contained by the slot, + * then the SPTE can't be made huge. More importantly, trying + * to query that info from slot->arch.lpage_info will cause an + * out-of-bounds access. */ - while (max_mapping_level > iter.level) - tdp_iter_step_up(&iter); + if (iter.gfn < start || iter.gfn >= end) + continue; - /* Note, a successful atomic zap also does a remote TLB flush. */ - tdp_mmu_zap_spte_atomic(kvm, &iter); + max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, + iter.gfn, PG_LEVEL_NUM); + if (max_mapping_level < iter.level) + continue; - /* - * If the atomic zap fails, the iter will recurse back into - * the same subtree to retry. - */ + /* Note, a successful atomic zap also does a remote TLB flush. */ + if (tdp_mmu_zap_spte_atomic(kvm, &iter)) + goto retry; } rcu_read_unlock(); } /* - * Clear non-leaf entries (and free associated page tables) which could - * be replaced by large mappings, for GFNs within the slot. + * Zap non-leaf SPTEs (and free their associated page tables) which could + * be replaced by huge pages, for GFNs within the slot. 
*/ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) -- cgit v1.2.3 From c33f6f2228fe8517e38941a508e9f905f99ecba9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Jun 2022 21:35:50 +0000 Subject: KVM: x86: Split kvm_is_valid_cr4() and export only the non-vendor bits Split the common x86 parts of kvm_is_valid_cr4(), i.e. the reserved bits checks, into a separate helper, __kvm_is_valid_cr4(), and export only the inner helper to vendor code in order to prevent nested VMX from calling back into vmx_is_valid_cr4() via kvm_is_valid_cr4(). On SVM, this is a nop as SVM doesn't place any additional restrictions on CR4. On VMX, this is also currently a nop, but only because nested VMX is missing checks on reserved CR4 bits for nested VM-Enter. That bug will be fixed in a future patch, and could simply use kvm_is_valid_cr4() as-is, but nVMX has _another_ bug where VMXON emulation doesn't enforce VMX's restrictions on CR0/CR4. The cleanest and most intuitive way to fix the VMXON bug is to use nested_host_cr{0,4}_valid(). If the CR4 variant routes through kvm_is_valid_cr4(), using nested_host_cr4_valid() won't do the right thing for the VMXON case as vmx_is_valid_cr4() enforces VMX's restrictions if and only if the vCPU is post-VMXON. Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20220607213604.3346000-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 3 ++- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 12 +++++++++--- arch/x86/kvm/x86.h | 2 +- 4 files changed, 14 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 23252ab82194..76dcc8a3e849 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -325,7 +325,8 @@ static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu, return false; } - if (CC(!kvm_is_valid_cr4(vcpu, save->cr4))) + /* Note, SVM doesn't have any additional restrictions on CR4. */ + if (CC(!__kvm_is_valid_cr4(vcpu, save->cr4))) return false; if (CC(!kvm_valid_efer(vcpu, save->efer))) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b0cc911a8f6f..75ee509f0b7b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3238,8 +3238,8 @@ static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { /* * We operate under the default treatment of SMM, so VMX cannot be - * enabled under SMM. Note, whether or not VMXE is allowed at all is - * handled by kvm_is_valid_cr4(). + * enabled under SMM. Note, whether or not VMXE is allowed at all, + * i.e. is a reserved bit, is handled by common x86 code. 
*/ if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) return false; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41aa3137665c..5366f884e9a7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1081,7 +1081,7 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv); -bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) return false; @@ -1089,9 +1089,15 @@ bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) return false; - return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); + return true; +} +EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4); + +static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + return __kvm_is_valid_cr4(vcpu, cr4) && + static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); } -EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 501b884b8cc4..1926d2cb8e79 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -434,7 +434,7 @@ static inline void kvm_machine_check(void) void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); -bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); -- cgit v1.2.3 From ca58f3aa53d165afe4ab74c755bc2f6d168617ac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Jun 2022 21:35:51 +0000 Subject: KVM: nVMX: Account for KVM reserved CR4 bits in consistency checks Check that the guest (L2) and host (L1) CR4 values that would be loaded by nested VM-Enter and VM-Exit respectively are valid with respect to KVM's (L0 host) allowed CR4 bits. Failure to check KVM reserved bits would allow L1 to load an illegal CR4 (or trigger hardware VM-Fail or failed VM-Entry) by massaging guest CPUID to allow features that are not supported by KVM. Amusingly, KVM itself is an accomplice in its doom, as KVM adjusts L1's MSR_IA32_VMX_CR4_FIXED1 to allow L1 to enable bits for L2 based on L1's CPUID model. Note, although nested_{guest,host}_cr4_valid() are _currently_ used if and only if the vCPU is post-VMXON (nested.vmxon == true), that may not be true in the future, e.g. emulating VMXON has a bug where it doesn't check the allowed/required CR0/CR4 bits. 
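The fixed0/fixed1 rule that nested_cr4_valid() builds on (see the nested.h hunk below) is: every bit set in fixed0 must be 1 in the value, and every bit clear in fixed1 must be 0. A standalone sketch of an equivalent check (illustrative only; the real helper lives in KVM's nested VMX code and may differ in shape):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool fixed_bits_ok(uint64_t val, uint64_t fixed0, uint64_t fixed1)
{
	return ((val & fixed0) == fixed0) &&	/* all must-be-1 bits set */
	       ((val & ~fixed1) == 0);		/* no must-be-0 bits set  */
}

int main(void)
{
	uint64_t fixed0 = 1ULL << 13;	/* e.g. CR4.VMXE must be 1 */
	uint64_t fixed1 = ~0ULL;	/* no bit is forced to 0   */

	printf("%d\n", fixed_bits_ok(1ULL << 13, fixed0, fixed1));	/* 1 */
	printf("%d\n", fixed_bits_ok(0, fixed0, fixed1));		/* 0 */
	return 0;
}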
Cc: stable@vger.kernel.org Fixes: 3899152ccbf4 ("KVM: nVMX: fix checks on CR{0,4} during virtual VMX operation") Signed-off-by: Sean Christopherson Message-Id: <20220607213604.3346000-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index c92cea0b8ccc..129ae4e01f7c 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -281,7 +281,8 @@ static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; - return fixed_bits_valid(val, fixed0, fixed1); + return fixed_bits_valid(val, fixed0, fixed1) && + __kvm_is_valid_cr4(vcpu, val); } /* No difference in the restrictions on guest and host CR4 in VMX operation. */ -- cgit v1.2.3 From c7d855c2aff2d511fd60ee2e356134c4fb394799 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Jun 2022 21:35:52 +0000 Subject: KVM: nVMX: Inject #UD if VMXON is attempted with incompatible CR0/CR4 Inject a #UD if L1 attempts VMXON with a CR0 or CR4 that is disallowed per the associated nested VMX MSRs' fixed0/1 settings. KVM cannot rely on hardware to perform the checks, even for the few checks that have higher priority than VM-Exit, as (a) KVM may have forced CR0/CR4 bits in hardware while running the guest, (b) there may be incompatible CR0/CR4 bits that have lower priority than VM-Exit, e.g. CR0.NE, and (c) userspace may have further restricted the allowed CR0/CR4 values by manipulating the guest's nested VMX MSRs. Note, despite a very strong desire to throw shade at Jim, commit 70f3aac964ae ("kvm: nVMX: Remove superfluous VMX instruction fault checks") is not to blame for the buggy behavior (though the comment...). That commit only removed the CR0.PE, EFLAGS.VM, and COMPATIBILITY mode checks (though it did erroneously drop the CPL check, but that has already been remedied). KVM may force CR0.PE=1, but will do so only when also forcing EFLAGS.VM=1 to emulate Real Mode, i.e. hardware will still #UD. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216033 Fixes: ec378aeef9df ("KVM: nVMX: Implement VMXON and VMXOFF") Reported-by: Eric Li Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20220607213604.3346000-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bfa366938c49..9c3350545b09 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4952,20 +4952,25 @@ static int handle_vmon(struct kvm_vcpu *vcpu) | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; /* - * The Intel VMX Instruction Reference lists a bunch of bits that are - * prerequisite to running VMXON, most notably cr4.VMXE must be set to - * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this). - * Otherwise, we should fail with #UD. But most faulting conditions - * have already been checked by hardware, prior to the VM-exit for - * VMXON. We do test guest cr4.VMXE because processor CR4 always has - * that bit set to 1 in non-root mode.
+ * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks + * that have higher priority than VM-Exit (see Intel SDM's pseudocode + * for VMXON), as KVM must load valid CR0/CR4 values into hardware while + * running the guest, i.e. KVM needs to check the _guest_ values. + * + * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and + * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real + * Mode, but KVM will never take the guest out of those modes. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { + if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || + !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - /* CPL=0 must be checked manually. */ + /* + * CPL=0 and all other checks that are lower priority than VM-Exit must + * be checked manually. + */ if (vmx_get_cpl(vcpu)) { kvm_inject_gp(vcpu, 0); return 1; -- cgit v1.2.3 From a645c2b506fbca06f671b22f0991bd99064d7e69 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Jun 2022 21:35:53 +0000 Subject: KVM: nVMX: Rename handle_vm{on,off}() to handle_vmx{on,off}() Rename the exit handlers for VMXON and VMXOFF to match the instruction names; the terms "vmon" and "vmoff" are not used anywhere in Intel's documentation, nor are they used elsewhere in KVM. Sadly, the exit reasons are exposed to userspace and so cannot be renamed without breaking userspace. :-( Fixes: ec378aeef9df ("KVM: nVMX: Implement VMXON and VMXOFF") Signed-off-by: Sean Christopherson Message-Id: <20220607213604.3346000-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 9c3350545b09..1e760994d207 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4942,7 +4942,7 @@ out_vmcs02: } /* Emulate the VMXON instruction. */ -static int handle_vmon(struct kvm_vcpu *vcpu) +static int handle_vmxon(struct kvm_vcpu *vcpu) { int ret; gpa_t vmptr; @@ -5039,7 +5039,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) } /* Emulate the VMXOFF instruction */ -static int handle_vmoff(struct kvm_vcpu *vcpu) +static int handle_vmxoff(struct kvm_vcpu *vcpu) { if (!nested_vmx_check_permission(vcpu)) return 1; @@ -6816,8 +6816,8 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; - exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; - exit_handlers[EXIT_REASON_VMON] = handle_vmon; + exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff; + exit_handlers[EXIT_REASON_VMON] = handle_vmxon; exit_handlers[EXIT_REASON_INVEPT] = handle_invept; exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; -- cgit v1.2.3 From f8ae08f9789ad59d318ea75b570caa454aceda81 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Jun 2022 21:35:54 +0000 Subject: KVM: nVMX: Let userspace set nVMX MSR to any _host_ supported value Restrict the nVMX MSRs based on KVM's config, not based on the guest's current config. Using the guest's config to audit the new config prevents userspace from restoring the original config (KVM's config) if at any point in the past the guest's config was restricted in any way.
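The audit itself is a masked subset test; validating against KVM's own capabilities rather than the guest's current values means a previously narrowed config can always be widened back out. A standalone sketch of the subset rule (shape only; the kernel helper may differ in detail, and the values are hypothetical):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool is_bitwise_subset(uint64_t superset, uint64_t subset, uint64_t mask)
{
	superset &= mask;
	subset &= mask;

	/* Within @mask, @subset may not set a bit absent from @superset. */
	return (superset | subset) == superset;
}

int main(void)
{
	uint64_t host_caps = 0xF0;	/* hypothetical supported feature bits */

	printf("%d\n", is_bitwise_subset(host_caps, 0x30, ~0ULL)); /* 1: ok */
	printf("%d\n", is_bitwise_subset(host_caps, 0x0F, ~0ULL)); /* 0: unsupported bits */
	return 0;
}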
Fixes: 62cc6b9dc61e ("KVM: nVMX: support restore of VMX capability MSRs") Cc: stable@vger.kernel.org Cc: David Matlack Signed-off-by: Sean Christopherson Message-Id: <20220607213604.3346000-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 70 +++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1e760994d207..c1c85fd75d42 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1224,7 +1224,7 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | /* reserved */ BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); - u64 vmx_basic = vmx->nested.msrs.basic; + u64 vmx_basic = vmcs_config.nested.basic; if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) return -EINVAL; @@ -1247,36 +1247,42 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) return 0; } -static int -vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, + u32 **low, u32 **high) { - u64 supported; - u32 *lowp, *highp; - switch (msr_index) { case MSR_IA32_VMX_TRUE_PINBASED_CTLS: - lowp = &vmx->nested.msrs.pinbased_ctls_low; - highp = &vmx->nested.msrs.pinbased_ctls_high; + *low = &msrs->pinbased_ctls_low; + *high = &msrs->pinbased_ctls_high; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - lowp = &vmx->nested.msrs.procbased_ctls_low; - highp = &vmx->nested.msrs.procbased_ctls_high; + *low = &msrs->procbased_ctls_low; + *high = &msrs->procbased_ctls_high; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: - lowp = &vmx->nested.msrs.exit_ctls_low; - highp = &vmx->nested.msrs.exit_ctls_high; + *low = &msrs->exit_ctls_low; + *high = &msrs->exit_ctls_high; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - lowp = &vmx->nested.msrs.entry_ctls_low; - highp = &vmx->nested.msrs.entry_ctls_high; + *low = &msrs->entry_ctls_low; + *high = &msrs->entry_ctls_high; break; case MSR_IA32_VMX_PROCBASED_CTLS2: - lowp = &vmx->nested.msrs.secondary_ctls_low; - highp = &vmx->nested.msrs.secondary_ctls_high; + *low = &msrs->secondary_ctls_low; + *high = &msrs->secondary_ctls_high; break; default: BUG(); } +} + +static int +vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u32 *lowp, *highp; + u64 supported; + + vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp); supported = vmx_control_msr(*lowp, *highp); @@ -1288,6 +1294,7 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) return -EINVAL; + vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp); *lowp = data; *highp = data >> 32; return 0; @@ -1301,10 +1308,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | /* reserved */ GENMASK_ULL(13, 9) | BIT_ULL(31); - u64 vmx_misc; - - vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, - vmx->nested.msrs.misc_high); + u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, + vmcs_config.nested.misc_high); if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) return -EINVAL; @@ -1332,10 +1337,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) { - u64 vmx_ept_vpid_cap; - - vmx_ept_vpid_cap = 
vmx_control_msr(vmx->nested.msrs.ept_caps, - vmx->nested.msrs.vpid_caps); + u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps, + vmcs_config.nested.vpid_caps); /* Every bit is either reserved or a feature bit. */ if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) @@ -1346,20 +1349,21 @@ static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) return 0; } -static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) { - u64 *msr; - switch (msr_index) { case MSR_IA32_VMX_CR0_FIXED0: - msr = &vmx->nested.msrs.cr0_fixed0; - break; + return &msrs->cr0_fixed0; case MSR_IA32_VMX_CR4_FIXED0: - msr = &vmx->nested.msrs.cr4_fixed0; - break; + return &msrs->cr4_fixed0; default: BUG(); } +} + +static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); /* * 1 bits (which indicates bits which "must-be-1" during VMX operation) @@ -1368,7 +1372,7 @@ static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) if (!is_bitwise_subset(data, *msr, -1ULL)) return -EINVAL; - *msr = data; + *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; return 0; } @@ -1429,7 +1433,7 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmx->nested.msrs.vmcs_enum = data; return 0; case MSR_IA32_VMX_VMFUNC: - if (data & ~vmx->nested.msrs.vmfunc_controls) + if (data & ~vmcs_config.nested.vmfunc_controls) return -EINVAL; vmx->nested.msrs.vmfunc_controls = data; return 0; -- cgit v1.2.3 From 8805875aa4732340af49aa601f59a60e482f635f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 22 Jul 2022 05:07:39 -0400 Subject: Revert "KVM: nVMX: Do not expose MPX VMX controls when guest MPX disabled" Since commit 5f76f6f5ff96 ("KVM: nVMX: Do not expose MPX VMX controls when guest MPX disabled"), KVM has taken ownership of the "load IA32_BNDCFGS" and "clear IA32_BNDCFGS" VMX entry/exit controls, trying to set these bits in the IA32_VMX_TRUE_{ENTRY,EXIT}_CTLS MSRs if the guest's CPUID supports MPX, and to clear them otherwise. The intent of the patch was to apply it to L0 in order to work around L1 kernels that lack the fix in commit 691bd4340bef ("kvm: vmx: allow host to access guest MSR_IA32_BNDCFGS", 2017-07-04): by hiding the control bits from L0, L1 hides BNDCFGS from KVM_GET_MSR_INDEX_LIST, and the L1 bug is neutralized even in the absence of commit 691bd4340bef. This was perhaps a sensible kludge at the time, but a horrible idea in the long term, and in fact it has not been extended to other CPUID bits like these: X86_FEATURE_LM => VM_EXIT_HOST_ADDR_SPACE_SIZE, VM_ENTRY_IA32E_MODE, VMX_MISC_SAVE_EFER_LMA X86_FEATURE_TSC => CPU_BASED_RDTSC_EXITING, CPU_BASED_USE_TSC_OFFSETTING, SECONDARY_EXEC_TSC_SCALING X86_FEATURE_INVPCID_SINGLE => SECONDARY_EXEC_ENABLE_INVPCID X86_FEATURE_MWAIT => CPU_BASED_MONITOR_EXITING, CPU_BASED_MWAIT_EXITING X86_FEATURE_INTEL_PT => SECONDARY_EXEC_PT_CONCEAL_VMX, SECONDARY_EXEC_PT_USE_GPA, VM_EXIT_CLEAR_IA32_RTIT_CTL, VM_ENTRY_LOAD_IA32_RTIT_CTL X86_FEATURE_XSAVES => SECONDARY_EXEC_XSAVES These days it's sort of common knowledge that any MSR in KVM_GET_MSR_INDEX_LIST must allow *at least* setting it with KVM_SET_MSR to a default value, so it is unlikely that something like commit 5f76f6f5ff96 will be needed again. So revert it, at the potential cost of breaking L1s with a 6-year-old kernel.
While in principle the L0 owner doesn't control what runs on L1, such an old hypervisor would probably have many other bugs. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 75ee509f0b7b..4fd25e1d6ec9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7457,23 +7457,6 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) #undef cr4_fixed1_update } -static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (kvm_mpx_supported()) { - bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); - - if (mpx_enabled) { - vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; - vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; - } else { - vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; - vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; - } - } -} - static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7565,10 +7548,8 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); - if (nested_vmx_allowed(vcpu)) { + if (nested_vmx_allowed(vcpu)) nested_vmx_cr_fixed1_bits_update(vcpu); - nested_vmx_entry_exit_ctls_update(vcpu); - } if (boot_cpu_has(X86_FEATURE_INTEL_PT) && guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) -- cgit v1.2.3 From 93255bf92939d948bc86d81c6bb70bb0fecc5db1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 22 Jul 2022 22:44:06 +0000 Subject: KVM: VMX: Mark all PERF_GLOBAL_(OVF)_CTRL bits reserved if there's no vPMU Mark all MSR_CORE_PERF_GLOBAL_CTRL and MSR_CORE_PERF_GLOBAL_OVF_CTRL bits as reserved if there is no guest vPMU. The nVMX VM-Entry consistency checks do not check for a valid vPMU prior to consuming the masks via kvm_valid_perf_global_ctrl(), i.e. may incorrectly allow a non-zero mask to be loaded via VM-Enter or VM-Exit (well, attempted to be loaded, the actual MSR load will be rejected by intel_is_valid_msr()). Fixes: f5132b01386b ("KVM: Expose a version 2 architectural PMU to a guests") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20220722224409.1336532-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 4bc098fbec31..6e355c5d2f40 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -527,6 +527,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->version = 0; pmu->reserved_bits = 0xffffffff00200000ull; pmu->raw_event_mask = X86_RAW_EVENT_MASK; + pmu->global_ctrl_mask = ~0ull; + pmu->global_ovf_ctrl_mask = ~0ull; pmu->fixed_ctr_ctrl_mask = ~0ull; pmu->pebs_enable_mask = ~0ull; pmu->pebs_data_cfg_mask = ~0ull; -- cgit v1.2.3 From b663f0b5f3d665c261256d1f76e98f077c6e56af Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 22 Jul 2022 22:44:07 +0000 Subject: KVM: VMX: Add helper to check if the guest PMU has PERF_GLOBAL_CTRL Add a helper to check if the guest PMU has PERF_GLOBAL_CTRL, which is unintuitive _and_ diverges from Intel's architecturally defined behavior.
Even worse, KVM currently implements the check using two different (but equivalent) checks, _and_ there has been at least one attempt to add a _third_ flavor. Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20220722224409.1336532-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 4 ++-- arch/x86/kvm/vmx/vmx.h | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 6e355c5d2f40..78f2800fd850 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -111,7 +111,7 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); - if (pmu->version < 2) + if (!intel_pmu_has_perf_global_ctrl(pmu)) return true; return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); @@ -207,7 +207,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_CORE_PERF_GLOBAL_STATUS: case MSR_CORE_PERF_GLOBAL_CTRL: case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - ret = pmu->version > 1; + return intel_pmu_has_perf_global_ctrl(pmu); break; case MSR_IA32_PEBS_ENABLE: ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 286c88e285ea..2a0b94e0fda7 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -91,6 +91,18 @@ union vmx_exit_reason { u32 full; }; +static inline bool intel_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) +{ + /* + * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is + * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is + * greater than zero. However, KVM only exposes and emulates the MSR + * to/for the guest if the guest PMU supports at least "Architectural + * Performance Monitoring Version 2". + */ + return pmu->version > 1; +} + #define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc) #define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records) -- cgit v1.2.3 From 4496a6f9b45e8cd83343ad86a3984d614e22cf54 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 22 Jul 2022 22:44:08 +0000 Subject: KVM: nVMX: Attempt to load PERF_GLOBAL_CTRL on nVMX xfer iff it exists Attempt to load PERF_GLOBAL_CTRL during nested VM-Enter/VM-Exit if and only if the MSR exists (according to the guest vCPU model). KVM has very misguided handling of VM_{ENTRY,EXIT}_LOAD_IA32_PERF_GLOBAL_CTRL and attempts to force the nVMX MSR settings to match the vPMU model, i.e. to hide/expose the control based on whether or not the MSR exists from the guest's perspective. KVM's modifications fail to handle the scenario where the vPMU is hidden from the guest _after_ being exposed to the guest, e.g. by userspace doing multiple KVM_SET_CPUID2 calls, which is allowed if done before any KVM_RUN. nested_vmx_pmu_refresh() is called if and only if there's a recognized vPMU, i.e. KVM will leave the bits in the allowed state and then ultimately reject the MSR load and WARN. KVM should not force the VMX MSRs in the first place. KVM taking control of the MSRs was a misguided attempt at mimicking what commit 5f76f6f5ff96 ("KVM: nVMX: Do not expose MPX VMX controls when guest MPX disabled", 2018-10-01) did for MPX. However, the MPX commit was a workaround for another KVM bug and not something that should be imitated (and it should never have been done in the first place).
In other words, KVM's ABI _should_ be that userspace has full control over the MSRs, at which point it is trivial for userspace to trigger the WARN that asserts the MSR load cannot fail. The intent of the WARN is still valid; KVM has consistency checks to ensure that vmcs12->{guest,host}_ia32_perf_global_ctrl is valid. The problem is that '0' must be considered a valid value at all times, and so the simple/obvious solution is to just not actually load the MSR when it does not exist. It is userspace's responsibility to provide a sane vCPU model, i.e. KVM is well within its ABI and Intel's VMX architecture to skip the loads if the MSR does not exist. Fixes: 03a8871add95 ("KVM: nVMX: Expose load IA32_PERF_GLOBAL_CTRL VM-{Entry,Exit} control") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20220722224409.1336532-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c1c85fd75d42..819cfd926954 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2623,6 +2623,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && + intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->guest_ia32_perf_global_ctrl))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; @@ -4333,7 +4334,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; } - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && + intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl)); -- cgit v1.2.3 From 9389d5774aca444b6343d3b5f9b05f0820fe705e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 22 Jul 2022 22:44:09 +0000 Subject: Revert "KVM: nVMX: Expose load IA32_PERF_GLOBAL_CTRL VM-{Entry,Exit} control" This reverts commit 03a8871add95213827e2bea84c12133ae5df952e. Since commit 03a8871add95 ("KVM: nVMX: Expose load IA32_PERF_GLOBAL_CTRL VM-{Entry,Exit} control"), KVM has taken ownership of the "load IA32_PERF_GLOBAL_CTRL" VMX entry/exit control bits, trying to set these bits in the IA32_VMX_TRUE_{ENTRY,EXIT}_CTLS MSRs if the guest's CPUID supports the architectural PMU (CPUID[EAX=0Ah].EAX[7:0]=1), and to clear them otherwise. This was a misguided attempt at mimicking what commit 5f76f6f5ff96 ("KVM: nVMX: Do not expose MPX VMX controls when guest MPX disabled", 2018-10-01) did for MPX. However, that commit was a workaround for another KVM bug and not something that should be imitated. Mucking with the VMX MSRs creates a subtle, difficult-to-maintain ABI as KVM must ensure that any internal changes, e.g. to how KVM handles _any_ guest CPUID changes, yield the same functional result. Therefore, KVM's policy is to let userspace have full control of the guest vCPU model so long as the host kernel is not at risk. Now that KVM really truly ensures kvm_set_msr() will succeed by loading PERF_GLOBAL_CTRL if and only if it exists, revert KVM's misguided and roundabout behavior.
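With the revert in place, restoring KVM's default value is always accepted. A hypothetical userspace sketch of that flow (vcpu_fd and host_value are placeholders, not taken from this series; the one-entry struct wrapping is a common idiom for the flexible array in struct kvm_msrs):

	struct {
		struct kvm_msrs hdr;
		struct kvm_msr_entry entry;
	} m = {
		.hdr.nmsrs   = 1,
		.entry.index = MSR_IA32_VMX_TRUE_ENTRY_CTLS,
		.entry.data  = host_value,	/* KVM's default, i.e. the host-supported value */
	};

	if (ioctl(vcpu_fd, KVM_SET_MSRS, &m) < 1)
		err(1, "KVM_SET_MSRS");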
Signed-off-by: Paolo Bonzini [sean: make it a pure revert] Signed-off-by: Sean Christopherson Message-Id: <20220722224409.1336532-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 22 ---------------------- arch/x86/kvm/vmx/nested.h | 2 -- arch/x86/kvm/vmx/pmu_intel.c | 3 --- 3 files changed, 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 819cfd926954..ebeeddb6aeb1 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4824,28 +4824,6 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, return 0; } -void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, - bool vcpu_has_perf_global_ctrl) -{ - struct vcpu_vmx *vmx; - - if (!nested_vmx_allowed(vcpu)) - return; - - vmx = to_vmx(vcpu); - if (vcpu_has_perf_global_ctrl) { - vmx->nested.msrs.entry_ctls_high |= - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - vmx->nested.msrs.exit_ctls_high |= - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - } else { - vmx->nested.msrs.entry_ctls_high &= - ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - vmx->nested.msrs.exit_ctls_high &= - ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - } -} - static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, int *ret) { diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 129ae4e01f7c..88b00a7359e4 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -32,8 +32,6 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret); -void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, - bool vcpu_has_perf_global_ctrl); void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu); bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, int size); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 78f2800fd850..862c1a4d971b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -592,9 +592,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); - nested_vmx_pmu_refresh(vcpu, - intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)); - if (cpuid_model_is_consistent(vcpu)) x86_perf_get_lbr(&lbr_desc->records); else -- cgit v1.2.3 From a910b5ab6b250a88fff1866bf708642d83317466 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Jun 2022 21:36:00 +0000 Subject: KVM: nVMX: Set UMIP bit in CR4_FIXED1 MSR when emulating UMIP Make UMIP an "allowed-1" bit in the CR4_FIXED1 MSR when KVM is emulating UMIP. KVM emulates UMIP for both L1 and L2, and so should enumerate that L2 is allowed to have CR4.UMIP=1. Not setting the bit doesn't immediately break nVMX, as KVM does set/clear the bit in CR4_FIXED1 in response to a guest CPUID update, i.e. KVM will correctly (dis)allow nested VM-Entry based on whether or not UMIP is exposed to L1. That said, KVM should enumerate the bit as being allowed from time zero, e.g. userspace will see the wrong value if the MSR is read before CPUID is written.
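For context, the FIXED0/FIXED1 semantics work as follows: a CR value is legal for VMX operation iff every "must-be-1" (FIXED0) bit is set and every bit outside the "allowed-1" (FIXED1) mask is clear. A minimal sketch of that check (illustrative, not KVM's exact helper):

	static bool cr4_valid_for_vmx(u64 cr4, u64 fixed0, u64 fixed1)
	{
		return (cr4 & fixed0) == fixed0 && !(cr4 & ~fixed1);
	}

Setting X86_CR4_UMIP in cr4_fixed1 thus only widens the allowed-1 set; it does not force UMIP on.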
Fixes: 0367f205a3b7 ("KVM: vmx: add support for emulating UMIP") Signed-off-by: Sean Christopherson Message-Id: <20220607213604.3346000-12-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ebeeddb6aeb1..ed247a121325 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6757,6 +6757,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); + if (vmx_umip_emulated()) + msrs->cr4_fixed1 |= X86_CR4_UMIP; + msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); } -- cgit v1.2.3 From 0a8735a6acf36ac35499563dc44f3e3d5034a2ce Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Sun, 24 Jul 2022 22:34:28 -0500 Subject: KVM: SVM: Do not virtualize MSR accesses for APIC LVTT register AMD does not support APIC TSC-deadline timer mode. AVIC hardware will generate a GP fault when the guest kernel writes 1 to bit 18 of the APIC LVTT register (x2APIC register offset 0x32, i.e. MSR 0x832) to set the timer mode. (Note: bit 18 is reserved on AMD systems.) Therefore, always intercept and let KVM emulate the MSR accesses. Fixes: f3d7c8aa6882 ("KVM: SVM: Fix x2APIC MSRs interception") Signed-off-by: Suravee Suthikulpanit Message-Id: <20220725033428.3699-1-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index aef63aae922d..3e0639a68385 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -118,7 +118,14 @@ static const struct svm_direct_access_msrs { { .index = X2APIC_MSR(APIC_ESR), .always = false }, { .index = X2APIC_MSR(APIC_ICR), .always = false }, { .index = X2APIC_MSR(APIC_ICR2), .always = false }, - { .index = X2APIC_MSR(APIC_LVTT), .always = false }, + + /* + * Note: + * AMD does not virtualize APIC TSC-deadline timer mode, but it is + * emulated by KVM. When setting APIC LVTT (0x832) register bit 18, + * the AVIC hardware would generate GP fault. Therefore, always + * intercept the MSR 0x832, and do not setup direct_access_msr. + */ { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false }, { .index = X2APIC_MSR(APIC_LVTPC), .always = false }, { .index = X2APIC_MSR(APIC_LVT0), .always = false }, -- cgit v1.2.3 From 1bd9dfec9fd419920572b057e2c98d9877190b06 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 25 Jul 2022 00:33:56 -0500 Subject: KVM: x86: Do not block APIC write for non-ICR registers Commit 5413bcba7ed5 ("KVM: x86: Add support for vICR APIC-write VM-Exits in x2APIC mode") introduced logic in kvm_apic_write_nodecode() to prevent APIC writes to any offset other than ICR. This breaks x2AVIC support, which requires KVM to trap and emulate all x2APIC MSR writes. Therefore, remove the warning and modify the logic to allow the MSR writes.
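For reference, trapping and emulating x2APIC MSR writes relies on the fixed mapping between x2APIC MSRs and 16-byte-aligned xAPIC register offsets; a sketch of the decode (the kernel's own macros and helpers are equivalent, though the function name here is illustrative):

	#define APIC_BASE_MSR	0x800

	static u32 x2apic_msr_to_offset(u32 msr)
	{
		/* e.g. MSR 0x832 -> xAPIC offset 0x320 (LVTT) */
		return (msr - APIC_BASE_MSR) << 4;
	}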
Fixes: 5413bcba7ed5 ("KVM: x86: Add support for vICR APIC-write VM-Exits in x2APIC mode") Cc: Zeng Guang Suggested-by: Sean Christopherson Signed-off-by: Suravee Suthikulpanit Message-Id: <20220725053356.4275-1-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9d4f73c4dc02..e2ce3556915e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -69,6 +69,7 @@ static bool lapic_timer_advance_dynamic __read_mostly; /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); +static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data); static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val) { @@ -2283,21 +2284,20 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) struct kvm_lapic *apic = vcpu->arch.apic; u64 val; - if (apic_x2apic_mode(apic)) { - /* - * When guest APIC is in x2APIC mode and IPI virtualization - * is enabled, accessing APIC_ICR may cause trap-like VM-exit - * on Intel hardware. Other offsets are not possible. - */ - if (WARN_ON_ONCE(offset != APIC_ICR)) - return; - + if (apic_x2apic_mode(apic)) kvm_lapic_msr_read(apic, offset, &val); + else + val = kvm_lapic_get_reg(apic, offset); + + /* + * ICR is a single 64-bit register when x2APIC is enabled. For legacy + * xAPIC, ICR writes need to go down the common (slightly slower) path + * to get the upper half from ICR2. + */ + if (apic_x2apic_mode(apic) && offset == APIC_ICR) { kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32)); trace_kvm_apic_write(APIC_ICR, val); } else { - val = kvm_lapic_get_reg(apic, offset); - /* TODO: optimize to just emulate side effect w/o one more write */ kvm_lapic_reg_write(apic, offset, (u32)val); } -- cgit v1.2.3 From 6c6ab524cfae0799e55c82b2c1d61f1af0156f8d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 23 Jul 2022 01:30:29 +0000 Subject: KVM: x86/mmu: Treat NX as a valid SPTE bit for NPT Treat the NX bit as valid when using NPT, as KVM will set the NX bit when the NX huge page mitigation is enabled (mindblowing) and trigger the WARN that fires on reserved SPTE bits being set. KVM has required NX support for SVM since commit b26a71a1a5b9 ("KVM: SVM: Refuse to load kvm_amd if NX support is not available") for exactly this reason, but apparently it never occurred to anyone to actually test NPT with the mitigation enabled. ------------[ cut here ]------------ spte = 0x800000018a600ee7, level = 2, rsvd bits = 0x800f0000001fe000 WARNING: CPU: 152 PID: 15966 at arch/x86/kvm/mmu/spte.c:215 make_spte+0x327/0x340 [kvm] Hardware name: Google, Inc. Arcadia_IT_80/Arcadia_IT_80, BIOS 10.48.0 01/27/2022 RIP: 0010:make_spte+0x327/0x340 [kvm] Call Trace: tdp_mmu_map_handle_target_level+0xc3/0x230 [kvm] kvm_tdp_mmu_map+0x343/0x3b0 [kvm] direct_page_fault+0x1ae/0x2a0 [kvm] kvm_tdp_page_fault+0x7d/0x90 [kvm] kvm_mmu_page_fault+0xfb/0x2e0 [kvm] npf_interception+0x55/0x90 [kvm_amd] svm_invoke_exit_handler+0x31/0xf0 [kvm_amd] svm_handle_exit+0xf6/0x1d0 [kvm_amd] vcpu_enter_guest+0xb6d/0xee0 [kvm] ? 
kvm_pmu_trigger_event+0x6d/0x230 [kvm] vcpu_run+0x65/0x2c0 [kvm] kvm_arch_vcpu_ioctl_run+0x355/0x610 [kvm] kvm_vcpu_ioctl+0x551/0x610 [kvm] __se_sys_ioctl+0x77/0xc0 __x64_sys_ioctl+0x1d/0x20 do_syscall_64+0x44/0xa0 entry_SYSCALL_64_after_hwframe+0x46/0xb0 ---[ end trace 0000000000000000 ]--- Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20220723013029.1753623-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8e477333a263..3e1317325e1f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4735,7 +4735,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), - context->root_role.level, false, + context->root_role.level, true, boot_cpu_has(X86_FEATURE_GBPAGES), false, true); else -- cgit v1.2.3 From 6fac42f127b8e8464b8c13036dfed825981881d9 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Thu, 28 Jul 2022 08:09:19 +0300 Subject: KVM: SVM: Dump Virtual Machine Save Area (VMSA) to klog As the Virtual Machine Save Area (VMSA) is essential when troubleshooting attestation, dump it to the klog at the KERN_DEBUG priority level. Cc: Jarkko Sakkinen Suggested-by: Harald Hoyer Signed-off-by: Jarkko Sakkinen Message-Id: <20220728050919.24113-1-jarkko@profian.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 309bcdb2f929..1b70e6d68113 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -603,6 +603,9 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; + pr_debug("Virtual Machine Save Area (VMSA):\n"); + print_hex_dump(KERN_CONT, "", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); + return 0; } -- cgit v1.2.3 From 7edc3a68038ab151a8791ddb6217755a5e4a5809 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 28 Jul 2022 15:04:52 +1200 Subject: KVM, x86/mmu: Fix the comment around kvm_tdp_mmu_zap_leafs() kvm_tdp_mmu_zap_leafs() now zaps only leaf SPTEs and no longer frees non-root pages within the given GFN range, so the comment around it is no longer accurate. Fix it by moving the comment from tdp_mmu_zap_leafs() instead of duplicating it, as tdp_mmu_zap_leafs() is static and is only called by kvm_tdp_mmu_zap_leafs(). Opportunistically tweak the blurb about SPTEs being cleared to (a) say "zapped" instead of "cleared", because "cleared" will be wrong if/when KVM allows a non-zero value for a non-present SPTE (i.e. for Intel TDX), and (b) clarify that a flush is needed if and only if a SPTE has been zapped since the MMU lock was last acquired.
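A hypothetical caller sketch of that contract (locking and flush calls shown are illustrative, not lifted from the kernel): the returned boolean accumulates whether any SPTE was zapped, and the caller must flush before dropping mmu_lock:

	bool flush = false;

	write_lock(&kvm->mmu_lock);
	flush = kvm_tdp_mmu_zap_leafs(kvm, as_id, start, end,
				      true /* can_yield */, flush);
	if (flush)
		kvm_flush_remote_tlbs(kvm);	/* before the lock is released */
	write_unlock(&kvm->mmu_lock);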
Fixes: f47e5bbbc92f ("KVM: x86/mmu: Zap only TDP MMU leafs in zap range and mmu_notifier unmap") Suggested-by: Sean Christopherson Reviewed-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: <20220728030452.484261-1-kai.huang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 40ccb5fba870..bf2ccf9debca 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -924,9 +924,6 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) } /* - * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs - * have been cleared and a TLB flush is needed before releasing the MMU lock. - * * If can_yield is true, will release the MMU lock and reschedule if the * scheduler needs the CPU or there is contention on the MMU lock. If this * function cannot yield, it will not release the MMU lock or reschedule and @@ -969,10 +966,9 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, } /* - * Tears down the mappings for the range of gfns, [start, end), and frees the - * non-root pages mapping GFNs strictly within that range. Returns true if - * SPTEs have been cleared and a TLB flush is needed before releasing the - * MMU lock. + * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns + * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or + * more SPTEs were zapped since the MMU lock was last acquired. */ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, bool can_yield, bool flush) -- cgit v1.2.3 From 31f6e3832a0f1c366e54033335aed2375f6e447a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 1 Aug 2022 07:27:18 -0400 Subject: KVM: x86/mmu: remove unused variable The last use of 'pfn' went away with the same-named argument to host_pfn_mapping_level; now that the hugepage level is obtained exclusively from the host page tables, kvm_mmu_zap_collapsible_spte does not need to know host pfns at all. Fixes: a8ac499bb6ab ("KVM: x86/mmu: Don't require refcounted "struct page" to create huge SPTEs") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3e1317325e1f..4236a28b9be5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6416,13 +6416,11 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, u64 *sptep; struct rmap_iterator iter; int need_tlb_flush = 0; - kvm_pfn_t pfn; struct kvm_mmu_page *sp; restart: for_each_rmap_spte(rmap_head, &iter, sptep) { sp = sptep_to_sp(sptep); - pfn = spte_to_pfn(*sptep); /* * We cannot do huge page mapping for indirect shadow pages, -- cgit v1.2.3
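As an aside, dead stores like the removed 'pfn' can be caught mechanically: GCC's -Wunused-but-set-variable, which the kernel enables in "make W=1" builds, flags the pattern. A minimal reproduction (illustrative only):

	void demo(int x)
	{
		int pfn;

		pfn = x * 2;	/* gcc W=1: "variable 'pfn' set but not used" */
	}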