From 4f13d471e5d11034d56161af56d0f9396bc0b384 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 7 Jun 2021 06:15:32 +0000 Subject: KVM: SVM: Fix SEV SEND_START session length & SEND_UPDATE_DATA query length after commit 238eca821cee Commit 238eca821cee ("KVM: SVM: Allocate SEV command structures on local stack") uses the local stack to allocate the structures used to communicate with the PSP, which were earlier being kzalloced. This breaks SEV live migration for computing the SEND_START session length and SEND_UPDATE_DATA query length as session_len and trans_len and hdr_len fields are not zeroed respectively for the above commands before issuing the SEV Firmware API call, hence the firmware returns incorrect session length and update data header or trans length. Also the SEV Firmware API returns SEV_RET_INVALID_LEN firmware error for these length query API calls, and the return value and the firmware error needs to be passed to the userspace as it is, so need to remove the return check in the KVM code. Signed-off-by: Ashish Kalra Message-Id: <20210607061532.27459-1-Ashish.Kalra@amd.com> Fixes: 238eca821cee ("KVM: SVM: Allocate SEV command structures on local stack") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5bc887e9a986..e0ce5da97fc2 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1103,10 +1103,9 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, struct sev_data_send_start data; int ret; + memset(&data, 0, sizeof(data)); data.handle = sev->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); - if (ret < 0) - return ret; params->session_len = data.session_len; if (copy_to_user((void __user *)(uintptr_t)argp->data, params, @@ -1215,10 +1214,9 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, struct sev_data_send_update_data data; int ret; + memset(&data, 0, sizeof(data)); data.handle = sev->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); - if (ret < 0) - return ret; params->hdr_len = data.hdr_len; params->trans_len = data.trans_len; -- cgit v1.2.3 From e898da784aed0ea65f7672d941c01dc9b79e6299 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 7 Jun 2021 00:19:43 -0700 Subject: KVM: LAPIC: Write 0 to TMICT should also cancel vmx-preemption timer According to the SDM 10.5.4.1: A write of 0 to the initial-count register effectively stops the local APIC timer, in both one-shot and periodic mode. However, the lapic timer oneshot/periodic mode which is emulated by vmx-preemption timer doesn't stop by writing 0 to TMICT since vmx->hv_deadline_tsc is still programmed and the guest will receive the spurious timer interrupt later. This patch fixes it by also cancelling the vmx-preemption timer when writing 0 to the initial-count register. Reviewed-by: Sean Christopherson Signed-off-by: Wanpeng Li Message-Id: <1623050385-100988-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8120e8614b92..6d72d8f43310 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1494,6 +1494,15 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic) static void cancel_hv_timer(struct kvm_lapic *apic); +static void cancel_apic_timer(struct kvm_lapic *apic) +{ + hrtimer_cancel(&apic->lapic_timer.timer); + preempt_disable(); + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_timer(apic); + preempt_enable(); +} + static void apic_update_lvtt(struct kvm_lapic *apic) { u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) & @@ -1502,11 +1511,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic) if (apic->lapic_timer.timer_mode != timer_mode) { if (apic_lvtt_tscdeadline(apic) != (timer_mode == APIC_LVT_TIMER_TSCDEADLINE)) { - hrtimer_cancel(&apic->lapic_timer.timer); - preempt_disable(); - if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_timer(apic); - preempt_enable(); + cancel_apic_timer(apic); kvm_lapic_set_reg(apic, APIC_TMICT, 0); apic->lapic_timer.period = 0; apic->lapic_timer.tscdeadline = 0; @@ -2092,7 +2097,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) if (apic_lvtt_tscdeadline(apic)) break; - hrtimer_cancel(&apic->lapic_timer.timer); + cancel_apic_timer(apic); kvm_lapic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); break; -- cgit v1.2.3 From b1bd5cba3306691c771d558e94baa73e8b0b96b7 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 3 Jun 2021 13:24:55 +0800 Subject: KVM: X86: MMU: Use the correct inherited permissions to get shadow page When computing the access permissions of a shadow page, use the effective permissions of the walk up to that point, i.e. the logic AND of its parents' permissions. Two guest PxE entries that point at the same table gfn need to be shadowed with different shadow pages if their parents' permissions are different. KVM currently uses the effective permissions of the last non-leaf entry for all non-leaf entries. Because all non-leaf SPTEs have full ("uwx") permissions, and the effective permissions are recorded only in role.access and merged into the leaves, this can lead to incorrect reuse of a shadow page and eventually to a missing guest protection page fault. For example, here is a shared pagetable: pgd[] pud[] pmd[] virtual address pointers /->pmd1(u--)->pte1(uw-)->page1 <- ptr1 (u--) /->pud1(uw-)--->pmd2(uw-)->pte2(uw-)->page2 <- ptr2 (uw-) pgd-| (shared pmd[] as above) \->pud2(u--)--->pmd1(u--)->pte1(uw-)->page1 <- ptr3 (u--) \->pmd2(uw-)->pte2(uw-)->page2 <- ptr4 (u--) pud1 and pud2 point to the same pmd table, so: - ptr1 and ptr3 points to the same page. - ptr2 and ptr4 points to the same page. (pud1 and pud2 here are pud entries, while pmd1 and pmd2 here are pmd entries) - First, the guest reads from ptr1 first and KVM prepares a shadow page table with role.access=u--, from ptr1's pud1 and ptr1's pmd1. "u--" comes from the effective permissions of pgd, pud1 and pmd1, which are stored in pt->access. "u--" is used also to get the pagetable for pud1, instead of "uw-". - Then the guest writes to ptr2 and KVM reuses pud1 which is present. The hypervisor set up a shadow page for ptr2 with pt->access is "uw-" even though the pud1 pmd (because of the incorrect argument to kvm_mmu_get_page in the previous step) has role.access="u--". - Then the guest reads from ptr3. The hypervisor reuses pud1's shadow pmd for pud2, because both use "u--" for their permissions. Thus, the shadow pmd already includes entries for both pmd1 and pmd2. - At last, the guest writes to ptr4. This causes no vmexit or pagefault, because pud1's shadow page structures included an "uw-" page even though its role.access was "u--". Any kind of shared pagetable might have the similar problem when in virtual machine without TDP enabled if the permissions are different from different ancestors. In order to fix the problem, we change pt->access to be an array, and any access in it will not include permissions ANDed from child ptes. The test code is: https://lore.kernel.org/kvm/20210603050537.19605-1-jiangshanlai@gmail.com/ Remember to test it with TDP disabled. The problem had existed long before the commit 41074d07c78b ("KVM: MMU: Fix inherited permissions for emulated guest pte updates"), and it is hard to find which is the culprit. So there is no fixes tag here. Signed-off-by: Lai Jiangshan Message-Id: <20210603052455.21023-1-jiangshanlai@gmail.com> Cc: stable@vger.kernel.org Fixes: cea0f0e7ea54 ("[PATCH] KVM: MMU: Shadow page table caching") Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/mmu.rst | 4 ++-- arch/x86/kvm/mmu/paging_tmpl.h | 14 +++++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/mmu.rst b/Documentation/virt/kvm/mmu.rst index 5bfe28b0728e..20d85daed395 100644 --- a/Documentation/virt/kvm/mmu.rst +++ b/Documentation/virt/kvm/mmu.rst @@ -171,8 +171,8 @@ Shadow pages contain the following information: shadow pages) so role.quadrant takes values in the range 0..3. Each quadrant maps 1GB virtual address space. role.access: - Inherited guest access permissions in the form uwx. Note execute - permission is positive, not negative. + Inherited guest access permissions from the parent ptes in the form uwx. + Note execute permission is positive, not negative. role.invalid: The page is invalid and should not be used. It is a root page that is currently pinned (by a cpu hardware register pointing to it); once it is diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 70b7e44e3035..823a5919f9fa 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -90,8 +90,8 @@ struct guest_walker { gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; bool pte_writable[PT_MAX_FULL_LEVELS]; - unsigned pt_access; - unsigned pte_access; + unsigned int pt_access[PT_MAX_FULL_LEVELS]; + unsigned int pte_access; gfn_t gfn; struct x86_exception fault; }; @@ -418,13 +418,15 @@ retry_walk: } walker->ptes[walker->level - 1] = pte; + + /* Convert to ACC_*_MASK flags for struct guest_walker. */ + walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask); } while (!is_last_gpte(mmu, walker->level, pte)); pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; /* Convert to ACC_*_MASK flags for struct guest_walker. */ - walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask); walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask); errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); if (unlikely(errcode)) @@ -463,7 +465,8 @@ retry_walk: } pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, walker->pte_access, walker->pt_access); + __func__, (u64)pte, walker->pte_access, + walker->pt_access[walker->level - 1]); return 1; error: @@ -643,7 +646,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; - unsigned direct_access, access = gw->pt_access; + unsigned int direct_access, access; int top_level, level, req_level, ret; gfn_t base_gfn = gw->gfn; @@ -675,6 +678,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, sp = NULL; if (!is_shadow_present_pte(*it.sptep)) { table_gfn = gw->table_gfn[it.level - 2]; + access = gw->pt_access[it.level - 2]; sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, false, access); } -- cgit v1.2.3 From af3511ff7fa2107d6410831f3d71030f5e8d2b25 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 1 Jun 2021 01:46:28 +0800 Subject: KVM: x86: Ensure PV TLB flush tracepoint reflects KVM behavior In record_steal_time(), st->preempted is read twice, and trace_kvm_pv_tlb_flush() might output result inconsistent if kvm_vcpu_flush_tlb_guest() see a different st->preempted later. It is a very trivial problem and hardly has actual harm and can be avoided by reseting and reading st->preempted in atomic way via xchg(). Signed-off-by: Lai Jiangshan Message-Id: <20210531174628.10265-1-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1cd6d4685932..e2144eedaf79 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3101,9 +3101,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu) * expensive IPIs. */ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { + u8 st_preempted = xchg(&st->preempted, 0); + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, - st->preempted & KVM_VCPU_FLUSH_TLB); - if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) + st_preempted & KVM_VCPU_FLUSH_TLB); + if (st_preempted & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb_guest(vcpu); } else { st->preempted = 0; -- cgit v1.2.3 From f31500b0d437a2464ca5972d8f5439e156b74960 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 7 Jun 2021 10:57:48 -0700 Subject: KVM: x86: Ensure liveliness of nested VM-Enter fail tracepoint message Use the __string() machinery provided by the tracing subystem to make a copy of the string literals consumed by the "nested VM-Enter failed" tracepoint. A complete copy is necessary to ensure that the tracepoint can't outlive the data/memory it consumes and deference stale memory. Because the tracepoint itself is defined by kvm, if kvm-intel and/or kvm-amd are built as modules, the memory holding the string literals defined by the vendor modules will be freed when the module is unloaded, whereas the tracepoint and its data in the ring buffer will live until kvm is unloaded (or "indefinitely" if kvm is built-in). This bug has existed since the tracepoint was added, but was recently exposed by a new check in tracing to detect exactly this type of bug. fmt: '%s%s ' current_buffer: ' vmx_dirty_log_t-140127 [003] .... kvm_nested_vmenter_failed: ' WARNING: CPU: 3 PID: 140134 at kernel/trace/trace.c:3759 trace_check_vprintf+0x3be/0x3e0 CPU: 3 PID: 140134 Comm: less Not tainted 5.13.0-rc1-ce2e73ce600a-req #184 Hardware name: ASUS Q87M-E/Q87M-E, BIOS 1102 03/03/2014 RIP: 0010:trace_check_vprintf+0x3be/0x3e0 Code: <0f> 0b 44 8b 4c 24 1c e9 a9 fe ff ff c6 44 02 ff 00 49 8b 97 b0 20 RSP: 0018:ffffa895cc37bcb0 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffffa895cc37bd08 RCX: 0000000000000027 RDX: 0000000000000027 RSI: 00000000ffffdfff RDI: ffff9766cfad74f8 RBP: ffffffffc0a041d4 R08: ffff9766cfad74f0 R09: ffffa895cc37bad8 R10: 0000000000000001 R11: 0000000000000001 R12: ffffffffc0a041d4 R13: ffffffffc0f4dba8 R14: 0000000000000000 R15: ffff976409f2c000 FS: 00007f92fa200740(0000) GS:ffff9766cfac0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000559bd11b0000 CR3: 000000019fbaa002 CR4: 00000000001726e0 Call Trace: trace_event_printf+0x5e/0x80 trace_raw_output_kvm_nested_vmenter_failed+0x3a/0x60 [kvm] print_trace_line+0x1dd/0x4e0 s_show+0x45/0x150 seq_read_iter+0x2d5/0x4c0 seq_read+0x106/0x150 vfs_read+0x98/0x180 ksys_read+0x5f/0xe0 do_syscall_64+0x40/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Cc: Steven Rostedt Fixes: 380e0055bc7e ("KVM: nVMX: trace nested VM-Enter failures detected by H/W") Signed-off-by: Sean Christopherson Reviewed-by: Steven Rostedt (VMware) Message-Id: <20210607175748.674002-1-seanjc@google.com> --- arch/x86/kvm/trace.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index a61c015870e3..4f839148948b 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1550,16 +1550,16 @@ TRACE_EVENT(kvm_nested_vmenter_failed, TP_ARGS(msg, err), TP_STRUCT__entry( - __field(const char *, msg) + __string(msg, msg) __field(u32, err) ), TP_fast_assign( - __entry->msg = msg; + __assign_str(msg, msg); __entry->err = err; ), - TP_printk("%s%s", __entry->msg, !__entry->err ? "" : + TP_printk("%s%s", __get_str(msg), !__entry->err ? "" : __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS)) ); -- cgit v1.2.3 From b53e84eed08b88fd3ff59e5c2a7f1a69d4004e32 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 1 Jun 2021 01:22:56 +0800 Subject: KVM: x86: Unload MMU on guest TLB flush if TDP disabled to force MMU sync When using shadow paging, unload the guest MMU when emulating a guest TLB flush to ensure all roots are synchronized. From the guest's perspective, flushing the TLB ensures any and all modifications to its PTEs will be recognized by the CPU. Note, unloading the MMU is overkill, but is done to mirror KVM's existing handling of INVPCID(all) and ensure the bug is squashed. Future cleanup can be done to more precisely synchronize roots when servicing a guest TLB flush. If TDP is enabled, synchronizing the MMU is unnecessary even if nested TDP is in play, as a "legacy" TLB flush from L1 does not invalidate L1's TDP mappings. For EPT, an explicit INVEPT is required to invalidate guest-physical mappings; for NPT, guest mappings are always tagged with an ASID and thus can only be invalidated via the VMCB's ASID control. This bug has existed since the introduction of KVM_VCPU_FLUSH_TLB. It was only recently exposed after Linux guests stopped flushing the local CPU's TLB prior to flushing remote TLBs (see commit 4ce94eabac16, "x86/mm/tlb: Flush remote and local TLBs concurrently"), but is also visible in Windows 10 guests. Tested-by: Maxim Levitsky Reviewed-by: Maxim Levitsky Fixes: f38a7b75267f ("KVM: X86: support paravirtualized help for TLB shootdowns") Signed-off-by: Lai Jiangshan [sean: massaged comment and changelog] Message-Id: <20210531172256.2908-1-jiangshanlai@gmail.com> Signed-off-by: Sean Christopherson Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e2144eedaf79..9dd23bdfc6cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3072,6 +3072,19 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; + + if (!tdp_enabled) { + /* + * A TLB flush on behalf of the guest is equivalent to + * INVPCID(all), toggling CR4.PGE, etc., which requires + * a forced sync of the shadow page tables. Unload the + * entire MMU here and the subsequent load will sync the + * shadow page tables, and also flush the TLB. + */ + kvm_mmu_unload(vcpu); + return; + } + static_call(kvm_x86_tlb_flush_guest)(vcpu); } -- cgit v1.2.3