From 6011cf68c88545e16cb32039c2cecfdae6a32315 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Jul 2021 16:35:48 +0100 Subject: KVM: arm64: Walk userspace page tables to compute the THP mapping size We currently rely on the kvm_is_transparent_hugepage() helper to discover whether a given page has the potential to be mapped as a block mapping. However, this API doesn't really give un everything we want: - we don't get the size: this is not crucial today as we only support PMD-sized THPs, but we'd like to have larger sizes in the future - we're the only user left of the API, and there is a will to remove it altogether To address the above, implement a simple walker using the existing page table infrastructure, and plumb it into transparent_hugepage_adjust(). No new page sizes are supported in the process. Signed-off-by: Marc Zyngier Reviewed-by: Alexandru Elisei Link: https://lore.kernel.org/r/20210726153552.1535838-3-maz@kernel.org --- arch/arm64/kvm/mmu.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0625bf2353c2..183c107c06b2 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -433,6 +433,32 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, return 0; } +static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { + /* We shouldn't need any other callback to walk the PT */ + .phys_to_virt = kvm_host_va, +}; + +static int get_user_mapping_size(struct kvm *kvm, u64 addr) +{ + struct kvm_pgtable pgt = { + .pgd = (kvm_pte_t *)kvm->mm->pgd, + .ia_bits = VA_BITS, + .start_level = (KVM_PGTABLE_MAX_LEVELS - + CONFIG_PGTABLE_LEVELS), + .mm_ops = &kvm_user_mm_ops, + }; + kvm_pte_t pte = 0; /* Keep GCC quiet... */ + u32 level = ~0; + int ret; + + ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); + VM_BUG_ON(ret); + VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS); + VM_BUG_ON(!(pte & PTE_VALID)); + + return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); +} + static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .zalloc_page = stage2_memcache_zalloc_page, .zalloc_pages_exact = kvm_host_zalloc_pages_exact, @@ -780,7 +806,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, * Returns the size of the mapping. */ static unsigned long -transparent_hugepage_adjust(struct kvm_memory_slot *memslot, +transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long hva, kvm_pfn_t *pfnp, phys_addr_t *ipap) { @@ -791,8 +817,8 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot, * sure that the HVA and IPA are sufficiently aligned and that the * block map is contained within the memslot. */ - if (kvm_is_transparent_hugepage(pfn) && - fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) && + get_user_mapping_size(kvm, hva) >= PMD_SIZE) { /* * The address we faulted on is backed by a transparent huge * page. However, because we map the compound huge page and @@ -1051,7 +1077,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * backed by a THP and thus use block mapping if possible. */ if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) - vma_pagesize = transparent_hugepage_adjust(memslot, hva, + vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, &fault_ipa); if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) { -- cgit v1.2.3 From f2cc327303b13a70311e823bd52aa0bca8c7ddbc Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Jul 2021 16:35:49 +0100 Subject: KVM: arm64: Avoid mapping size adjustment on permission fault Since we only support PMD-sized mappings for THP, getting a permission fault on a level that results in a mapping being larger than PAGE_SIZE is a sure indication that we have already upgraded our mapping to a PMD. In this case, there is no need to try and parse userspace page tables, as the fault information already tells us everything. Signed-off-by: Marc Zyngier Reviewed-by: Alexandru Elisei Link: https://lore.kernel.org/r/20210726153552.1535838-4-maz@kernel.org --- arch/arm64/kvm/mmu.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 183c107c06b2..116a2910a4a4 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1076,9 +1076,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * If we are not forced to use page mapping, check if we are * backed by a THP and thus use block mapping if possible. */ - if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) - vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva, - &pfn, &fault_ipa); + if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) { + if (fault_status == FSC_PERM && fault_granule > PAGE_SIZE) + vma_pagesize = fault_granule; + else + vma_pagesize = transparent_hugepage_adjust(kvm, memslot, + hva, &pfn, + &fault_ipa); + } if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) { /* Check the VMM hasn't introduced a new VM_SHARED VMA */ -- cgit v1.2.3 From 0fe49630101b3ce23bd21a2788440ac719ec868a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Jul 2021 16:35:51 +0100 Subject: KVM: arm64: Use get_page() instead of kvm_get_pfn() When mapping a THP, we are guaranteed that the page isn't reserved, and we can safely avoid the kvm_is_reserved_pfn() call. Replace kvm_get_pfn() with get_page(pfn_to_page()). Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210726153552.1535838-6-maz@kernel.org --- arch/arm64/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 116a2910a4a4..6e002d30a478 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -840,7 +840,7 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, *ipap &= PMD_MASK; kvm_release_pfn_clean(pfn); pfn &= ~(PTRS_PER_PMD - 1); - kvm_get_pfn(pfn); + get_page(pfn_to_page(pfn)); *pfnp = pfn; return PMD_SIZE; -- cgit v1.2.3 From 63db506e07622c344a3c748a1c06293d48780f83 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Jul 2021 16:35:47 +0100 Subject: KVM: arm64: Introduce helper to retrieve a PTE and its level It is becoming a common need to fetch the PTE for a given address together with its level. Add such a helper. Signed-off-by: Marc Zyngier Reviewed-by: Quentin Perret Reviewed-by: Alexandru Elisei Link: https://lore.kernel.org/r/20210726153552.1535838-2-maz@kernel.org --- arch/arm64/include/asm/kvm_pgtable.h | 20 ++++++++++++++++++ arch/arm64/kvm/hyp/pgtable.c | 39 ++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index f004c0115d89..e42b55bd50a2 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -432,6 +432,26 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size); int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_pgtable_walker *walker); +/** + * kvm_pgtable_get_leaf() - Walk a page-table and retrieve the leaf entry + * with its level. + * @pgt: Page-table structure initialised by kvm_pgtable_*_init() + * or a similar initialiser. + * @addr: Input address for the start of the walk. + * @ptep: Pointer to storage for the retrieved PTE. + * @level: Pointer to storage for the level of the retrieved PTE. + * + * The offset of @addr within a page is ignored. + * + * The walker will walk the page-table entries corresponding to the input + * address specified, retrieving the leaf corresponding to this address. + * Invalid entries are treated as leaf entries. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, + kvm_pte_t *ptep, u32 *level); + /** * kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical * Addresses with compatible permission diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 05321f4165e3..78f36bd5df6c 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -326,6 +326,45 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, return _kvm_pgtable_walk(&walk_data); } +struct leaf_walk_data { + kvm_pte_t pte; + u32 level; +}; + +static int leaf_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + struct leaf_walk_data *data = arg; + + data->pte = *ptep; + data->level = level; + + return 0; +} + +int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, + kvm_pte_t *ptep, u32 *level) +{ + struct leaf_walk_data data; + struct kvm_pgtable_walker walker = { + .cb = leaf_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = &data, + }; + int ret; + + ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE), + PAGE_SIZE, &walker); + if (!ret) { + if (ptep) + *ptep = data.pte; + if (level) + *level = data.level; + } + + return ret; +} + struct hyp_map_data { u64 phys; kvm_pte_t attr; -- cgit v1.2.3 From 0ab410a93d627ae73136d1a52c096262360b7992 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 19 Jul 2021 13:38:59 +0100 Subject: KVM: arm64: Narrow PMU sysreg reset values to architectural requirements A number of the PMU sysregs expose reset values that are not compliant with the architecture (set bits in the RES0 ranges, for example). This in turn has the effect that we need to pointlessly mask some register fields when using them. Let's start by making sure we don't have illegal values in the shadow registers at reset time. This affects all the registers that dedicate one bit per counter, the counters themselves, PMEVTYPERn_EL0 and PMSELR_EL0. Reported-by: Alexandre Chartre Reviewed-by: Alexandre Chartre Acked-by: Russell King (Oracle) Signed-off-by: Marc Zyngier Reviewed-by: Alexandru Elisei Link: https://lore.kernel.org/r/20210719123902.1493805-2-maz@kernel.org --- arch/arm64/kvm/sys_regs.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index f6f126eb6ac1..96bdfa0e68b2 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -603,6 +603,41 @@ static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } +static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX); + + /* No PMU available, any PMU reg may UNDEF... */ + if (!kvm_arm_support_pmu_v3()) + return; + + n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT; + n &= ARMV8_PMU_PMCR_N_MASK; + if (n) + mask |= GENMASK(n - 1, 0); + + reset_unknown(vcpu, r); + __vcpu_sys_reg(vcpu, r->reg) &= mask; +} + +static void reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + reset_unknown(vcpu, r); + __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0); +} + +static void reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + reset_unknown(vcpu, r); + __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_EVTYPE_MASK; +} + +static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + reset_unknown(vcpu, r); + __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK; +} + static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 pmcr, val; @@ -944,16 +979,18 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, trap_wcr, reset_wcr, 0, 0, get_wcr, set_wcr } #define PMU_SYS_REG(r) \ - SYS_DESC(r), .reset = reset_unknown, .visibility = pmu_visibility + SYS_DESC(r), .reset = reset_pmu_reg, .visibility = pmu_visibility /* Macro to expand the PMEVCNTRn_EL0 register */ #define PMU_PMEVCNTR_EL0(n) \ { PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)), \ + .reset = reset_pmevcntr, \ .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), } /* Macro to expand the PMEVTYPERn_EL0 register */ #define PMU_PMEVTYPER_EL0(n) \ { PMU_SYS_REG(SYS_PMEVTYPERn_EL0(n)), \ + .reset = reset_pmevtyper, \ .access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), } static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -1595,13 +1632,13 @@ static const struct sys_reg_desc sys_reg_descs[] = { { PMU_SYS_REG(SYS_PMSWINC_EL0), .access = access_pmswinc, .reg = PMSWINC_EL0 }, { PMU_SYS_REG(SYS_PMSELR_EL0), - .access = access_pmselr, .reg = PMSELR_EL0 }, + .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 }, { PMU_SYS_REG(SYS_PMCEID0_EL0), .access = access_pmceid, .reset = NULL }, { PMU_SYS_REG(SYS_PMCEID1_EL0), .access = access_pmceid, .reset = NULL }, { PMU_SYS_REG(SYS_PMCCNTR_EL0), - .access = access_pmu_evcntr, .reg = PMCCNTR_EL0 }, + .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0 }, { PMU_SYS_REG(SYS_PMXEVTYPER_EL0), .access = access_pmu_evtyper, .reset = NULL }, { PMU_SYS_REG(SYS_PMXEVCNTR_EL0), -- cgit v1.2.3 From f5eff40058a856c23c5ec2f31756f107a2b1ef84 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 19 Jul 2021 13:39:00 +0100 Subject: KVM: arm64: Drop unnecessary masking of PMU registers We always sanitise our PMU sysreg on the write side, so there is no need to do it on the read side as well. Drop the unnecessary masking. Acked-by: Russell King (Oracle) Reviewed-by: Alexandre Chartre Reviewed-by: Alexandru Elisei Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210719123902.1493805-3-maz@kernel.org --- arch/arm64/kvm/pmu-emul.c | 3 +-- arch/arm64/kvm/sys_regs.c | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index f33825c995cb..fae4e95b586c 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -373,7 +373,6 @@ static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); - reg &= kvm_pmu_valid_counter_mask(vcpu); } return reg; @@ -569,7 +568,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) if (val & ARMV8_PMU_PMCR_E) { kvm_pmu_enable_counter_mask(vcpu, - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask); + __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); } else { kvm_pmu_disable_counter_mask(vcpu, mask); } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 96bdfa0e68b2..f22139658e48 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -880,7 +880,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, kvm_pmu_disable_counter_mask(vcpu, val); } } else { - p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask; + p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); } return true; @@ -904,7 +904,7 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* accessing PMINTENCLR_EL1 */ __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val; } else { - p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1) & mask; + p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1); } return true; @@ -926,7 +926,7 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* accessing PMOVSCLR_EL0 */ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask); } else { - p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0) & mask; + p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); } return true; -- cgit v1.2.3 From ca4f202d08ba7f24cc97dce14c6d20ec7a679135 Mon Sep 17 00:00:00 2001 From: Alexandre Chartre Date: Mon, 19 Jul 2021 13:39:01 +0100 Subject: KVM: arm64: Disabling disabled PMU counters wastes a lot of time In a KVM guest on arm64, performance counters interrupts have an unnecessary overhead which slows down execution when using the "perf record" command and limits the "perf record" sampling period. The problem is that when a guest VM disables counters by clearing the PMCR_EL0.E bit (bit 0), KVM will disable all counters defined in PMCR_EL0 even if they are not enabled in PMCNTENSET_EL0. KVM disables a counter by calling into the perf framework, in particular by calling perf_event_create_kernel_counter() which is a time consuming operation. So, for example, with a Neoverse N1 CPU core which has 6 event counters and one cycle counter, KVM will always disable all 7 counters even if only one is enabled. This typically happens when using the "perf record" command in a guest VM: perf will disable all event counters with PMCNTENTSET_EL0 and only uses the cycle counter. And when using the "perf record" -F option with a high profiling frequency, the overhead of KVM disabling all counters instead of one on every counter interrupt becomes very noticeable. The problem is fixed by having KVM disable only counters which are enabled in PMCNTENSET_EL0. If a counter is not enabled in PMCNTENSET_EL0 then KVM will not enable it when setting PMCR_EL0.E and it will remain disabled as long as it is not enabled in PMCNTENSET_EL0. So there is effectively no need to disable a counter when clearing PMCR_EL0.E if it is not enabled PMCNTENSET_EL0. Acked-by: Russell King (Oracle) Reviewed-by: Alexandru Elisei Signed-off-by: Alexandre Chartre [maz: moved 'mask' close to the actual user, simplifying the patch] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210712170345.660272-1-alexandre.chartre@oracle.com Link: https://lore.kernel.org/r/20210719123902.1493805-4-maz@kernel.org --- arch/arm64/kvm/pmu-emul.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index fae4e95b586c..dc65b58dc68f 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -563,20 +563,21 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) */ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) { - unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); int i; if (val & ARMV8_PMU_PMCR_E) { kvm_pmu_enable_counter_mask(vcpu, __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); } else { - kvm_pmu_disable_counter_mask(vcpu, mask); + kvm_pmu_disable_counter_mask(vcpu, + __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); } if (val & ARMV8_PMU_PMCR_C) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); if (val & ARMV8_PMU_PMCR_P) { + unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); mask &= ~BIT(ARMV8_PMU_CYCLE_IDX); for_each_set_bit(i, &mask, 32) kvm_pmu_set_counter_value(vcpu, i, 0); -- cgit v1.2.3 From 7a3ba3095a32f9c4ec8f30d680fea5150e12c3f3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 19 Jul 2021 13:39:02 +0100 Subject: KVM: arm64: Remove PMSWINC_EL0 shadow register We keep an entry for the PMSWINC_EL0 register in the vcpu structure, while *never* writing anything there outside of reset. Given that the register is defined as write-only, that we always trap when this register is accessed, there is little point in saving anything anyway. Get rid of the entry, and save a mighty 8 bytes per vcpu structure. We still need to keep it exposed to userspace in order to preserve backward compatibility with previously saved VMs. Since userspace cannot expect any effect of writing to PMSWINC_EL0, treat the register as RAZ/WI for the purpose of userspace access. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210719123902.1493805-5-maz@kernel.org --- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/sys_regs.c | 21 ++++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 41911585ae0c..afc169630884 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -185,7 +185,6 @@ enum vcpu_sysreg { PMCNTENSET_EL0, /* Count Enable Set Register */ PMINTENSET_EL1, /* Interrupt Enable Set Register */ PMOVSSET_EL0, /* Overflow Flag Status Set Register */ - PMSWINC_EL0, /* Software Increment Register */ PMUSERENR_EL0, /* User Enable Register */ /* Pointer Authentication Registers in a strict increasing order. */ diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index f22139658e48..a1f5101f49a3 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1286,6 +1286,20 @@ static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return __set_id_reg(vcpu, rd, uaddr, true); } +static int set_wi_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + const struct kvm_one_reg *reg, void __user *uaddr) +{ + int err; + u64 val; + + /* Perform the access even if we are going to ignore the value */ + err = reg_from_user(&val, uaddr, sys_reg_to_index(rd)); + if (err) + return err; + + return 0; +} + static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { @@ -1629,8 +1643,13 @@ static const struct sys_reg_desc sys_reg_descs[] = { .access = access_pmcnten, .reg = PMCNTENSET_EL0 }, { PMU_SYS_REG(SYS_PMOVSCLR_EL0), .access = access_pmovs, .reg = PMOVSSET_EL0 }, + /* + * PM_SWINC_EL0 is exposed to userspace as RAZ/WI, as it was + * previously (and pointlessly) advertised in the past... + */ { PMU_SYS_REG(SYS_PMSWINC_EL0), - .access = access_pmswinc, .reg = PMSWINC_EL0 }, + .get_user = get_raz_id_reg, .set_user = set_wi_reg, + .access = access_pmswinc, .reset = NULL }, { PMU_SYS_REG(SYS_PMSELR_EL0), .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 }, { PMU_SYS_REG(SYS_PMCEID0_EL0), -- cgit v1.2.3 From 673692735fdc40ed7da32c0cb3517adaf4227b2b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Jul 2021 15:04:25 -0700 Subject: KVM: x86: Use KVM_BUG/KVM_BUG_ON to handle bugs that are fatal to the VM Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Reviewed-by: Paolo Bonzini Message-Id: <0e8760a26151f47dc47052b25ca8b84fffe0641e.1625186503.git.isaku.yamahata@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 23 ++++++++++++++--------- arch/x86/kvm/x86.c | 4 ++++ 3 files changed, 19 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e8ccab50ebf6..4ce6d827fccd 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1560,7 +1560,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); break; default: - WARN_ON_ONCE(1); + KVM_BUG_ON(1, vcpu->kvm); } } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 927a552393b9..fb1ac33a9902 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2274,7 +2274,7 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; break; default: - WARN_ON_ONCE(1); + KVM_BUG_ON(1, vcpu->kvm); break; } } @@ -4996,6 +4996,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) return kvm_complete_insn_gp(vcpu, err); case 3: WARN_ON_ONCE(enable_unrestricted_guest); + err = kvm_set_cr3(vcpu, val); return kvm_complete_insn_gp(vcpu, err); case 4: @@ -5021,14 +5022,13 @@ static int handle_cr(struct kvm_vcpu *vcpu) } break; case 2: /* clts */ - WARN_ONCE(1, "Guest should always own CR0.TS"); - vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); - trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - return kvm_skip_emulated_instruction(vcpu); + KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); + return -EIO; case 1: /*mov from cr*/ switch (cr) { case 3: WARN_ON_ONCE(enable_unrestricted_guest); + val = kvm_read_cr3(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); @@ -5338,7 +5338,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) static int handle_nmi_window(struct kvm_vcpu *vcpu) { - WARN_ON_ONCE(!enable_vnmi); + if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) + return -EIO; + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); ++vcpu->stat.nmi_window_exits; kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5896,7 +5898,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * below) should never happen as that means we incorrectly allowed a * nested VM-Enter with an invalid vmcs12. */ - WARN_ON_ONCE(vmx->nested.nested_run_pending); + if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) + return -EIO; /* If guest state is invalid, start emulating */ if (vmx->emulation_required) @@ -6274,7 +6277,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) int max_irr; bool max_irr_updated; - WARN_ON(!vcpu->arch.apicv_active); + if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm)) + return -EIO; + if (pi_test_on(&vmx->pi_desc)) { pi_clear_on(&vmx->pi_desc); /* @@ -6357,7 +6362,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; gate_desc *desc = (gate_desc *)host_idt_base + vector; - if (WARN_ONCE(!is_external_intr(intr_info), + if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e5d5c5ed7dd4..338844b82c4c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9395,6 +9395,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_VM_BUGGED, vcpu)) { + r = -EIO; + goto out; + } if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { r = 0; -- cgit v1.2.3 From 19025e7bc5977b35c73bea99841245f93470105e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Jul 2021 15:04:26 -0700 Subject: KVM: x86/mmu: Mark VM as bugged if page fault returns RET_PF_INVALID Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Reviewed-by: Paolo Bonzini Message-Id: <298980aa5fc5707184ac082287d13a800cd9c25f.1625186503.git.isaku.yamahata@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 66f7f5bc3482..ce1d955a0536 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5134,7 +5134,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (r == RET_PF_INVALID) { r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, lower_32_bits(error_code), false); - if (WARN_ON_ONCE(r == RET_PF_INVALID)) + if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; } -- cgit v1.2.3 From e489a4a6bddb83385cb3d896776c5a7b2ec653d8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Jul 2021 15:04:29 -0700 Subject: KVM: x86: Hoist kvm_dirty_regs check out of sync_regs() Move the kvm_dirty_regs vs. KVM_SYNC_X86_VALID_FIELDS check out of sync_regs() and into its sole caller, kvm_arch_vcpu_ioctl_run(). This allows a future patch to allow synchronizing select state for protected VMs. Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Reviewed-by: Paolo Bonzini Message-Id: <889017a8d31cea46472e0c64b234ef5919278ed9.1625186503.git.isaku.yamahata@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 338844b82c4c..e906c05e530e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9980,7 +9980,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } - if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { + if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) || + (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) { r = -EINVAL; goto out; } @@ -10585,9 +10586,6 @@ static void store_regs(struct kvm_vcpu *vcpu) static int sync_regs(struct kvm_vcpu *vcpu) { - if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS) - return -EINVAL; - if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) { __set_regs(vcpu, &vcpu->run->s.regs.regs); vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS; -- cgit v1.2.3 From 03fffc5493c8c8d850586df5740c985026c313bf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Jul 2021 15:04:50 -0700 Subject: KVM: x86/mmu: Refactor shadow walk in __direct_map() to reduce indentation Employ a 'continue' to reduce the indentation for linking a new shadow page during __direct_map() in preparation for linking private pages. Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Reviewed-by: Paolo Bonzini Message-Id: <702419686d5700373123f6ea84e7a946c2cad8b4.1625186503.git.isaku.yamahata@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index ce1d955a0536..85c70492f863 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2939,15 +2939,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, break; drop_large_spte(vcpu, it.sptep); - if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, - it.level - 1, true, ACC_ALL); - - link_shadow_page(vcpu, it.sptep, sp); - if (is_tdp && huge_page_disallowed && - req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); - } + if (is_shadow_present_pte(*it.sptep)) + continue; + + sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, + it.level - 1, true, ACC_ALL); + + link_shadow_page(vcpu, it.sptep, sp); + if (is_tdp && huge_page_disallowed && + req_level >= it.level) + account_huge_nx_page(vcpu->kvm, sp); } ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, -- cgit v1.2.3 From 7fa2a347512ab06ea1558302564879c060d52b0c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Jul 2021 15:04:51 -0700 Subject: KVM: x86/mmu: Return old SPTE from mmu_spte_clear_track_bits() Return the old SPTE when clearing a SPTE and push the "old SPTE present" check to the caller. Private shadow page support will use the old SPTE in rmap_remove() to determine whether or not there is a linked private shadow page. Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Reviewed-by: Paolo Bonzini Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 85c70492f863..d9e3ac6dda05 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -592,9 +592,9 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. - * Returns non-zero if the PTE was previously valid. + * Returns the old PTE. */ -static int mmu_spte_clear_track_bits(u64 *sptep) +static u64 mmu_spte_clear_track_bits(u64 *sptep) { kvm_pfn_t pfn; u64 old_spte = *sptep; @@ -605,7 +605,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep) old_spte = __update_clear_spte_slow(sptep, 0ull); if (!is_shadow_present_pte(old_spte)) - return 0; + return old_spte; pfn = spte_to_pfn(old_spte); @@ -622,7 +622,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep) if (is_dirty_spte(old_spte)) kvm_set_pfn_dirty(pfn); - return 1; + return old_spte; } /* @@ -1119,7 +1119,9 @@ out: static void drop_spte(struct kvm *kvm, u64 *sptep) { - if (mmu_spte_clear_track_bits(sptep)) + u64 old_spte = mmu_spte_clear_track_bits(sptep); + + if (is_shadow_present_pte(old_spte)) rmap_remove(kvm, sptep); } -- cgit v1.2.3 From ec1cf69c376970f42761e641cf5074b84f8db243 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 25 Jun 2021 11:32:06 -0400 Subject: KVM: X86: Add per-vm stat for max rmap list size Add a new statistic max_mmu_rmap_size, which stores the maximum size of rmap for the vm. Signed-off-by: Peter Xu Message-Id: <20210625153214.43106-2-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 2 ++ arch/x86/kvm/x86.c | 1 + 3 files changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 974cbfb1eefe..d798650ad793 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1209,6 +1209,7 @@ struct kvm_vm_stat { u64 lpages; u64 nx_lpage_splits; u64 max_mmu_page_hash_collisions; + u64 max_mmu_rmap_size; }; struct kvm_vcpu_stat { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d9e3ac6dda05..0b2954b5fbc6 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2698,6 +2698,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (is_shadow_present_pte(*sptep)) { if (!was_rmapped) { rmap_count = rmap_add(vcpu, sptep, gfn); + if (rmap_count > vcpu->kvm->stat.max_mmu_rmap_size) + vcpu->kvm->stat.max_mmu_rmap_size = rmap_count; if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e906c05e530e..ab200f2cb1f0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -235,6 +235,7 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_ICOUNTER(VM, mmu_unsync), STATS_DESC_ICOUNTER(VM, lpages), STATS_DESC_ICOUNTER(VM, nx_lpage_splits), + STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size), STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions) }; static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == -- cgit v1.2.3 From 38f703663d4c82ead5b51b8860deeef19d6dcb6d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 27 Jul 2021 12:32:51 +0200 Subject: KVM: arm64: Count VMID-wide TLB invalidations KVM/ARM has an architecture-specific implementation of kvm_flush_remote_tlbs; however, unlike the generic one, it does not count the flushes in kvm->stat.remote_tlb_flush, so that it inexorably remained stuck to zero. Signed-off-by: Paolo Bonzini Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210727103251.16561-1-pbonzini@redhat.com --- arch/arm64/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0625bf2353c2..2ca0a494a111 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -81,6 +81,7 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot) void kvm_flush_remote_tlbs(struct kvm *kvm) { kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); + ++kvm->stat.generic.remote_tlb_flush; } static bool kvm_is_device_pfn(unsigned long pfn) -- cgit v1.2.3 From 013cc4c6788f1ce9885d3c0281904f93ee8f2271 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 28 Jul 2021 21:06:23 +0800 Subject: KVM: arm64: Fix comments related to GICv2 PMR reporting Remove the repeated word 'the' from two comments. Signed-off-by: Jason Wang Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210728130623.12017-1-wangborong@cdjrlc.com --- arch/arm64/kvm/vgic/vgic-mmio-v2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c index a016f07adc28..5f9014ae595b 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c @@ -282,7 +282,7 @@ static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu, case GIC_CPU_PRIMASK: /* * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the - * the PMR field as GICH_VMCR.VMPriMask rather than + * PMR field as GICH_VMCR.VMPriMask rather than * GICC_PMR.Priority, so we expose the upper five bits of * priority mask to userspace using the lower bits in the * unsigned long. @@ -329,7 +329,7 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu, case GIC_CPU_PRIMASK: /* * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the - * the PMR field as GICH_VMCR.VMPriMask rather than + * PMR field as GICH_VMCR.VMPriMask rather than * GICC_PMR.Priority, so we expose the upper five bits of * priority mask to userspace using the lower bits in the * unsigned long. -- cgit v1.2.3 From 1694caef426247bb406c2b04672336ef77b5fb87 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 1 Jul 2021 17:41:01 +0200 Subject: x86/kvm: remove non-x86 stuff from arch/x86/kvm/ioapic.h The file has been moved to arch/x86 long time ago. Time to get rid of non-x86 stuff. Signed-off-by: Juergen Gross Message-Id: <20210701154105.23215-3-jgross@suse.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 11e4065e1617..bbd4a5d18b5d 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -35,11 +35,7 @@ struct kvm_vcpu; #define IOAPIC_INIT 0x5 #define IOAPIC_EXTINT 0x7 -#ifdef CONFIG_X86 #define RTC_GSI 8 -#else -#define RTC_GSI -1U -#endif struct dest_map { /* vcpu bitmap where IRQ has been sent */ -- cgit v1.2.3 From 76cd325ea75bb2a84c329782d7b8015b6a970c34 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 13 Jul 2021 22:09:52 +0000 Subject: KVM: x86/mmu: Rename cr2_or_gpa to gpa in fast_page_fault fast_page_fault is only called from direct_page_fault where we know the address is a gpa. Fixes: 736c291c9f36 ("KVM: x86: Use gpa_t for cr2/gpa to fix TDP support on 32-bit KVM") Reviewed-by: Ben Gardon Reviewed-by: Sean Christopherson Signed-off-by: David Matlack Message-Id: <20210713220957.3493520-2-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0b2954b5fbc6..6f5910b7b9bc 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3103,8 +3103,7 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) /* * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. */ -static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 error_code) +static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -3120,7 +3119,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, do { u64 new_spte; - for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte) + for_each_shadow_entry_lockless(vcpu, gpa, iterator, spte) if (!is_shadow_present_pte(spte)) break; @@ -3199,8 +3198,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } while (true); - trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, - spte, ret); + trace_fast_page_fault(vcpu, gpa, error_code, iterator.sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; -- cgit v1.2.3 From 61bcd360aa9851cec969b7b40f23fd1faa7f85a4 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 13 Jul 2021 22:09:53 +0000 Subject: KVM: x86/mmu: Fix use of enums in trace_fast_page_fault Enum values have to be exported to userspace since the formatting is not done in the kernel. Without doing this perf maps RET_PF_FIXED and RET_PF_SPURIOUS to 0, which results in incorrect output: $ perf record -a -e kvmmmu:fast_page_fault --filter "ret==3" -- ./access_tracking_perf_test $ perf script | head -1 [...] new 610006048d25877 spurious 0 fixed 0 <------ should be 1 Fix this by exporting the enum values to userspace with TRACE_DEFINE_ENUM. Fixes: c4371c2a682e ("KVM: x86/mmu: Return unique RET_PF_* values if the fault was fixed") Signed-off-by: David Matlack Message-Id: <20210713220957.3493520-3-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu_internal.h | 3 +++ arch/x86/kvm/mmu/mmutrace.h | 6 ++++++ 2 files changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 35567293c1fd..626cb848dab4 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -140,6 +140,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. + * + * Any names added to this enum should be exported to userspace for use in + * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h */ enum { RET_PF_RETRY = 0, diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index efbad33a0645..2924a4081a19 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -54,6 +54,12 @@ { PFERR_RSVD_MASK, "RSVD" }, \ { PFERR_FETCH_MASK, "F" } +TRACE_DEFINE_ENUM(RET_PF_RETRY); +TRACE_DEFINE_ENUM(RET_PF_EMULATE); +TRACE_DEFINE_ENUM(RET_PF_INVALID); +TRACE_DEFINE_ENUM(RET_PF_FIXED); +TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); + /* * A pagetable walk has started */ -- cgit v1.2.3 From c5c8c7c53004cb70715320018c3b4287071c1cfd Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 13 Jul 2021 22:09:54 +0000 Subject: KVM: x86/mmu: Make walk_shadow_page_lockless_{begin,end} interoperate with the TDP MMU Acquire the RCU read lock in walk_shadow_page_lockless_begin and release it in walk_shadow_page_lockless_end when the TDP MMU is enabled. This should not introduce any functional changes but is used in the following commit to make fast_page_fault interoperate with the TDP MMU. Signed-off-by: David Matlack Message-Id: <20210713220957.3493520-4-dmatlack@google.com> [Use if...else instead of if(){return;}] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 52 +++++++++++++++++++++++++++------------------- arch/x86/kvm/mmu/tdp_mmu.c | 6 ++---- arch/x86/kvm/mmu/tdp_mmu.h | 10 +++++++++ 3 files changed, 43 insertions(+), 25 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6f5910b7b9bc..d5b0c8b0e9e9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -686,28 +686,36 @@ static bool mmu_spte_age(u64 *sptep) static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { - /* - * Prevent page table teardown by making any free-er wait during - * kvm_flush_remote_tlbs() IPI to all active vcpus. - */ - local_irq_disable(); + if (is_tdp_mmu(vcpu->arch.mmu)) { + kvm_tdp_mmu_walk_lockless_begin(); + } else { + /* + * Prevent page table teardown by making any free-er wait during + * kvm_flush_remote_tlbs() IPI to all active vcpus. + */ + local_irq_disable(); - /* - * Make sure a following spte read is not reordered ahead of the write - * to vcpu->mode. - */ - smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + /* + * Make sure a following spte read is not reordered ahead of the write + * to vcpu->mode. + */ + smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + } } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { - /* - * Make sure the write to vcpu->mode is not reordered in front of - * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us - * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. - */ - smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); - local_irq_enable(); + if (is_tdp_mmu(vcpu->arch.mmu)) { + kvm_tdp_mmu_walk_lockless_end(); + } else { + /* + * Make sure the write to vcpu->mode is not reordered in front of + * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us + * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. + */ + smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); + local_irq_enable(); + } } static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) @@ -3617,6 +3625,8 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. + * + * Must be called between walk_shadow_page_lockless_{begin,end}. */ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { @@ -3624,8 +3634,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level int leaf = -1; u64 spte; - walk_shadow_page_lockless_begin(vcpu); - for (shadow_walk_init(&iterator, vcpu, addr), *root_level = iterator.level; shadow_walk_okay(&iterator); @@ -3639,8 +3647,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level break; } - walk_shadow_page_lockless_end(vcpu); - return leaf; } @@ -3652,11 +3658,15 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) int root, leaf, level; bool reserved = false; + walk_shadow_page_lockless_begin(vcpu); + if (is_tdp_mmu(vcpu->arch.mmu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); + walk_shadow_page_lockless_end(vcpu); + if (unlikely(leaf < 0)) { *sptep = 0ull; return reserved; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 0853370bd811..228f0cc5e2cf 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1516,6 +1516,8 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. + * + * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. */ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) @@ -1527,14 +1529,10 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, *root_level = vcpu->arch.mmu->shadow_root_level; - rcu_read_lock(); - tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; } - rcu_read_unlock(); - return leaf; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 1cae4485b3bc..93e1bf5089c4 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -77,6 +77,16 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int min_level); +static inline void kvm_tdp_mmu_walk_lockless_begin(void) +{ + rcu_read_lock(); +} + +static inline void kvm_tdp_mmu_walk_lockless_end(void) +{ + rcu_read_unlock(); +} + int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); -- cgit v1.2.3 From 6e8eb2060cc7fbc4d388d0ab70502e265aec98c6 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 13 Jul 2021 22:09:55 +0000 Subject: KVM: x86/mmu: fast_page_fault support for the TDP MMU Make fast_page_fault interoperate with the TDP MMU by leveraging walk_shadow_page_lockless_{begin,end} to acquire the RCU read lock and introducing a new helper function kvm_tdp_mmu_fast_pf_get_last_sptep to grab the lowest level sptep. Suggested-by: Ben Gardon Signed-off-by: David Matlack Message-Id: <20210713220957.3493520-5-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 50 ++++++++++++++++++++++++++++++++++------------ arch/x86/kvm/mmu/tdp_mmu.c | 41 +++++++++++++++++++++++++++++++++++++ arch/x86/kvm/mmu/tdp_mmu.h | 2 ++ 3 files changed, 80 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d5b0c8b0e9e9..29010abb659c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3108,15 +3108,41 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) return spte & PT_PRESENT_MASK; } +/* + * Returns the last level spte pointer of the shadow page walk for the given + * gpa, and sets *spte to the spte value. This spte may be non-preset. If no + * walk could be performed, returns NULL and *spte does not contain valid data. + * + * Contract: + * - Must be called between walk_shadow_page_lockless_{begin,end}. + * - The returned sptep must not be used after walk_shadow_page_lockless_end. + */ +static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) +{ + struct kvm_shadow_walk_iterator iterator; + u64 old_spte; + u64 *sptep = NULL; + + for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) { + sptep = iterator.sptep; + *spte = old_spte; + + if (!is_shadow_present_pte(old_spte)) + break; + } + + return sptep; +} + /* * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. */ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) { - struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; u64 spte = 0ull; + u64 *sptep = NULL; uint retry_count = 0; if (!page_fault_can_be_fast(error_code)) @@ -3127,14 +3153,15 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) do { u64 new_spte; - for_each_shadow_entry_lockless(vcpu, gpa, iterator, spte) - if (!is_shadow_present_pte(spte)) - break; + if (is_tdp_mmu(vcpu->arch.mmu)) + sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte); + else + sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte); if (!is_shadow_present_pte(spte)) break; - sp = sptep_to_sp(iterator.sptep); + sp = sptep_to_sp(sptep); if (!is_last_spte(spte, sp->role.level)) break; @@ -3192,8 +3219,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. */ - if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, - new_spte)) { + if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) { ret = RET_PF_FIXED; break; } @@ -3206,7 +3232,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) } while (true); - trace_fast_page_fault(vcpu, gpa, error_code, iterator.sptep, spte, ret); + trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; @@ -3841,11 +3867,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - if (!is_tdp_mmu_fault) { - r = fast_page_fault(vcpu, gpa, error_code); - if (r != RET_PF_INVALID) - return r; - } + r = fast_page_fault(vcpu, gpa, error_code); + if (r != RET_PF_INVALID) + return r; r = mmu_topup_memory_caches(vcpu, false); if (r) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 228f0cc5e2cf..88aff5d0512c 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -527,6 +527,10 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, if (is_removed_spte(iter->old_spte)) return false; + /* + * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and + * does not hold the mmu_lock. + */ if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, new_spte) != iter->old_spte) return false; @@ -1536,3 +1540,40 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, return leaf; } + +/* + * Returns the last level spte pointer of the shadow page walk for the given + * gpa, and sets *spte to the spte value. This spte may be non-preset. If no + * walk could be performed, returns NULL and *spte does not contain valid data. + * + * Contract: + * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. + * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end. + * + * WARNING: This function is only intended to be called during fast_page_fault. + */ +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, + u64 *spte) +{ + struct tdp_iter iter; + struct kvm_mmu *mmu = vcpu->arch.mmu; + gfn_t gfn = addr >> PAGE_SHIFT; + tdp_ptep_t sptep = NULL; + + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + *spte = iter.old_spte; + sptep = iter.sptep; + } + + /* + * Perform the rcu_dereference to get the raw spte pointer value since + * we are passing it up to fast_page_fault, which is shared with the + * legacy MMU and thus does not retain the TDP MMU-specific __rcu + * annotation. + * + * This is safe since fast_page_fault obeys the contracts of this + * function as well as all TDP MMU contracts around modifying SPTEs + * outside of mmu_lock. + */ + return rcu_dereference(sptep); +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 93e1bf5089c4..361b47f98cc5 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -89,6 +89,8 @@ static inline void kvm_tdp_mmu_walk_lockless_end(void) int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, + u64 *spte); #ifdef CONFIG_X86_64 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); -- cgit v1.2.3 From 71ba3f3189c78f756a659568fb473600fd78f207 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 26 Jul 2021 12:30:52 -0400 Subject: KVM: x86: enable TDP MMU by default With the addition of fast page fault support, the TDP-specific MMU has reached feature parity with the original MMU. All my testing in the last few months has been done with the TDP MMU; switch the default on 64-bit machines. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 88aff5d0512c..41cee1d22918 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -10,7 +10,7 @@ #include #include -static bool __read_mostly tdp_mmu_enabled = false; +static bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); /* Initializes the TDP MMU for the VM, if enabled. */ -- cgit v1.2.3 From df63202fe52bd1583ce1418c755a81d6d41af3b9 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 13 Jul 2021 17:20:19 +0300 Subject: KVM: x86: APICv: drop immediate APICv disablement on current vCPU Special case of disabling the APICv on the current vCPU right away in kvm_request_apicv_update doesn't bring much benefit vs raising KVM_REQ_APICV_UPDATE on it instead, since this request will be processed on the next entry to the guest. (the comment about having another #VMEXIT is wrong). It also hides various assumptions that APIVc enable state matches the APICv inhibit state, as this special case only makes those states match on the current vCPU. Previous patches fixed few such assumptions so now it should be safe to drop this special case. Signed-off-by: Maxim Levitsky Message-Id: <20210713142023.106183-5-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ab200f2cb1f0..54fdadacb27f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9264,7 +9264,6 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); */ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { - struct kvm_vcpu *except; unsigned long old, new, expected; if (!kvm_x86_ops.check_apicv_inhibit_reasons || @@ -9290,16 +9289,7 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) if (kvm_x86_ops.pre_update_apicv_exec_ctrl) static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate); - /* - * Sending request to update APICV for all other vcpus, - * while update the calling vcpu immediately instead of - * waiting for another #VMEXIT to handle the request. - */ - except = kvm_get_running_vcpu(); - kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE, - except); - if (except) - kvm_vcpu_update_apicv(except); + kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); } EXPORT_SYMBOL_GPL(kvm_request_apicv_update); -- cgit v1.2.3 From df37ed38e6c26064d5f97ebbe5cacc75eeb89153 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:39 -0700 Subject: KVM: x86: Flush the guest's TLB on INIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flush the guest's TLB on INIT, as required by Intel's SDM. Although AMD's APM states that the TLBs are unchanged by INIT, it's not clear that that's correct as the APM also states that the TLB is flush on "External initialization of the processor." Regardless, relying on the guest to be paranoid is unnecessarily risky, while an unnecessary flush is benign from a functional perspective and likely has no measurable impact on guest performance. Note, as of the April 2021 version of Intels' SDM, it also contradicts itself with respect to TLB flushing. The overview of INIT explicitly calls out the TLBs as being invalidated, while a table later in the same section says they are unchanged. 9.1 INITIALIZATION OVERVIEW: The major difference is that during an INIT, the internal caches, MSRs, MTRRs, and x87 FPU state are left unchanged (although, the TLBs and BTB are invalidated as with a hardware reset) Table 9-1: Register Power up Reset INIT Data and Code Cache, TLBs: Invalid[6] Invalid[6] Unchanged Given Core2's erratum[*] about global TLB entries not being flush on INIT, it's safe to assume that the table is simply wrong. AZ28. INIT Does Not Clear Global Entries in the TLB Problem: INIT may not flush a TLB entry when: • The processor is in protected mode with paging enabled and the page global enable flag is set (PGE bit of CR4 register) • G bit for the page table entry is set • TLB entry is present in TLB when INIT occurs • Software may encounter unexpected page fault or incorrect address translation due to a TLB entry erroneously left in TLB after INIT. Workaround: Write to CR3, CR4 (setting bits PSE, PGE or PAE) or CR0 (setting bits PG or PE) registers before writing to memory early in BIOS code to clear all the global entries from TLB. Status: For the steppings affected, see the Summary Tables of Changes. [*] https://www.intel.com/content/dam/support/us/en/documents/processors/mobile/celeron/sb/320121.pdf Fixes: 6aa8b732ca01 ("[PATCH] kvm: userspace interface") Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 54fdadacb27f..164ecca7769e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10872,6 +10872,18 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) */ if (old_cr0 & X86_CR0_PG) kvm_mmu_reset_context(vcpu); + + /* + * Intel's SDM states that all TLB entries are flushed on INIT. AMD's + * APM states the TLBs are untouched by INIT, but it also states that + * the TLBs are flushed on "External initialization of the processor." + * Flush the guest TLB regardless of vendor, there is no meaningful + * benefit in relying on the guest to flush the TLB immediately after + * INIT. A spurious TLB flush is benign and likely negligible from a + * performance perspective. + */ + if (init_event) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) -- cgit v1.2.3 From afc8de0118be84f4058b9977d481aeb3e0758dbb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:40 -0700 Subject: KVM: nVMX: Set LDTR to its architecturally defined value on nested VM-Exit Set L1's LDTR on VM-Exit per the Intel SDM: The host-state area does not contain a selector field for LDTR. LDTR is established as follows on all VM exits: the selector is cleared to 0000H, the segment is marked unusable and is otherwise undefined (although the base address is always canonical). This is likely a benign bug since the LDTR is unusable, as it means the L1 VMM is conditioned to reload its LDTR in order to function properly on bare metal. Fixes: 4704d0befb07 ("KVM: nVMX: Exiting from L2 to L1") Reviewed-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1a52134b0c42..7f8184f432b4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4298,6 +4298,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, }; vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + memset(&seg, 0, sizeof(seg)); + seg.unusable = 1; + vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); + kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); -- cgit v1.2.3 From 4f117ce4aefca0e90cd44680219d4c261c1381b9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:41 -0700 Subject: KVM: SVM: Zero out GDTR.base and IDTR.base on INIT Explicitly set GDTR.base and IDTR.base to zero when intializing the VMCB. Functionally this only affects INIT, as the bases are implicitly set to zero on RESET by virtue of the VMCB being zero allocated. Per AMD's APM, GDTR.base and IDTR.base are zeroed after RESET and INIT. Fixes: 04d2cc7780d4 ("KVM: Move main vcpu loop into subarch independent code") Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 4ce6d827fccd..7845232b6fb6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1241,7 +1241,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu) SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; save->cs.limit = 0xffff; + save->gdtr.base = 0; save->gdtr.limit = 0xffff; + save->idtr.base = 0; save->idtr.limit = 0xffff; init_sys_seg(&save->ldtr, SEG_TYPE_LDT); -- cgit v1.2.3 From 2a24be79b6b7061a486239c3a3489eb67b9587f6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:42 -0700 Subject: KVM: VMX: Set EDX at INIT with CPUID.0x1, Family-Model-Stepping Set EDX at RESET/INIT based on the userspace-defined CPUID model when possible, i.e. when CPUID.0x1.EAX is defind by userspace. At RESET/INIT, all CPUs that support CPUID set EDX to the FMS enumerated in CPUID.0x1.EAX. If no CPUID match is found, fall back to KVM's default of 0x600 (Family '6'), which is the least awful approximation of KVM's virtual CPU model. Fixes: 6aa8b732ca01 ("[PATCH] kvm: userspace interface") Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index fb1ac33a9902..6946a5161d38 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4394,6 +4394,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct msr_data apic_base_msr; + u32 eax, dummy; u64 cr0; vmx->rmode.vm86_active = 0; @@ -4401,7 +4402,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->msr_ia32_umwait_control = 0; - vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); + eax = 1; + if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true)) + eax = get_rdx_init_val(); + kvm_rdx_write(vcpu, eax); + vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); -- cgit v1.2.3 From 067a456d091d05fdae32cae350410d905968b645 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:43 -0700 Subject: KVM: SVM: Require exact CPUID.0x1 match when stuffing EDX at INIT Do not allow an inexact CPUID "match" when querying the guest's CPUID.0x1 to stuff EDX during INIT. In the common case, where the guest CPU model is an AMD variant, allowing an inexact match is a nop since KVM doesn't emulate Intel's goofy "out-of-range" logic for AMD and Hygon. If the vCPU model happens to be an Intel variant, an inexact match is possible if and only if the max CPUID leaf is precisely '0'. Aside from the fact that there's probably no CPU in existence with a single CPUID leaf, if the max CPUID leaf is '0', that means that CPUID.0.EAX is '0', and thus an inexact match for CPUID.0x1.EAX will also yield '0'. So, with lots of twisty logic, no functional change intended. Reviewed-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7845232b6fb6..f4b6fec6252f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1346,7 +1346,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) } init_vmcb(vcpu); - kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false); + kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true); kvm_rdx_write(vcpu, eax); if (kvm_vcpu_apicv_active(vcpu) && !init_event) -- cgit v1.2.3 From 665f4d9238ad83c36dd4e078ccab45b3ddec211d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:44 -0700 Subject: KVM: SVM: Fall back to KVM's hardcoded value for EDX at RESET/INIT At vCPU RESET/INIT (mostly RESET), stuff EDX with KVM's hardcoded, default Family-Model-Stepping ID of 0x600 if CPUID.0x1 isn't defined. At RESET, the CPUID lookup is guaranteed to "miss" because KVM emulates RESET before exposing the vCPU to userspace, i.e. userspace can't possibly have done set the vCPU's CPUID model, and thus KVM will always write '0'. At INIT, using 0x600 is less bad than using '0'. While initializing EDX to '0' is _extremely_ unlikely to be noticed by the guest, let alone break the guest, and can be overridden by userspace for the RESET case, using 0x600 is preferable as it will allow consolidating the relevant VMX and SVM RESET/INIT logic in the future. And, digging through old specs suggests that neither Intel nor AMD have ever shipped a CPU that initialized EDX to '0' at RESET. Regarding 0x600 as KVM's default Family, it is a sane default and in many ways the most appropriate. Prior to the 386 implementations, DX was undefined at RESET. With the 386, 486, 586/P5, and 686/P6/Athlon, both Intel and AMD set EDX to 3, 4, 5, and 6 respectively. AMD switched to using '15' as its primary Family with the introduction of AMD64, but Intel has continued using '6' for the last few decades. So, '6' is a valid Family for both Intel and AMD CPUs, is compatible with both 32-bit and 64-bit CPUs (albeit not a perfect fit for 64-bit AMD), and of the common Families (3 - 6), is the best fit with respect to KVM's virtual CPU model. E.g. prior to the P6, Intel CPUs did not have a STI window. Modern operating systems, Linux included, rely on the STI window, e.g. for "safe halt", and KVM unconditionally assumes the virtual CPU has an STI window. Thus enumerating a Family ID of 3, 4, or 5 would be provably wrong. Opportunistically remove a stale comment. Fixes: 66f7b72e1171 ("KVM: x86: Make register state after reset conform to specification") Reviewed-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f4b6fec6252f..05c1e60d829a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1264,7 +1264,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); save->cr4 = X86_CR4_PAE; - /* rdx = ?? */ if (npt_enabled) { /* Setup VMCB for Nested Paging */ @@ -1346,7 +1345,15 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) } init_vmcb(vcpu); - kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true); + /* + * Fall back to KVM's default Family/Model/Stepping if no CPUID match + * is found. Note, it's impossible to get a match at RESET since KVM + * emulates RESET before exposing the vCPU to userspace, i.e. it's + * impossible for kvm_cpuid() to find a valid entry on RESET. But, go + * through the motions in case that's ever remedied, and to be pedantic. + */ + if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true)) + eax = get_rdx_init_val(); kvm_rdx_write(vcpu, eax); if (kvm_vcpu_apicv_active(vcpu) && !init_event) -- cgit v1.2.3 From 61152cd907d59ffd6b0a9479b2fa3b3b7b080409 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:45 -0700 Subject: KVM: VMX: Remove explicit MMU reset in enter_rmode() Drop an explicit MMU reset when entering emulated real mode now that the vCPU INIT/RESET path correctly handles conditional MMU resets, e.g. if INIT arrives while the vCPU is in 64-bit mode. Note, while there are multiple other direct calls to vmx_set_cr0(), i.e. paths that change CR0 without invoking kvm_post_set_cr0(), only the INIT emulation can reach enter_rmode(). CLTS emulation only toggles CR.TS, VM-Exit (and late VM-Fail) emulation cannot architecturally transition to Real Mode, and VM-Enter to Real Mode is possible if and only if Unrestricted Guest is enabled (exposed to L1). This effectively reverts commit 8668a3c468ed ("KVM: VMX: Reset mmu context when entering real mode") Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6946a5161d38..207393a429d1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2852,8 +2852,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu) fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); - - kvm_mmu_reset_context(vcpu); } int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) -- cgit v1.2.3 From 5d2d7e41e3b80f37ec8673825fae07ffe9f140c3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:46 -0700 Subject: KVM: SVM: Drop explicit MMU reset at RESET/INIT Drop an explicit MMU reset in SVM's vCPU RESET/INIT flow now that the common x86 path correctly handles conditional MMU resets, e.g. if INIT arrives while the vCPU is in 64-bit mode. This reverts commit ebae871a509d ("kvm: svm: reset mmu on VCPU reset"). Reviewed-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 05c1e60d829a..a9af9bff4755 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1261,7 +1261,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu) * It also updates the guest-visible cr0 value. */ svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); - kvm_mmu_reset_context(vcpu); save->cr4 = X86_CR4_PAE; -- cgit v1.2.3 From c2f79a65b4b66681894ef7d7e3912ba55acc20d5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:49 -0700 Subject: KVM: x86: WARN if the APIC map is dirty without an in-kernel local APIC WARN if KVM ends up in a state where it thinks its APIC map needs to be recalculated, but KVM is not emulating the local APIC. This is mostly to document KVM's "rules" in order to provide clarity in future cleanups. Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-12-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ba5a27879f1d..add4dd1e3528 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -192,6 +192,9 @@ void kvm_recalculate_apic_map(struct kvm *kvm) if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN) return; + WARN_ONCE(!irqchip_in_kernel(kvm), + "Dirty APIC map without an in-kernel local APIC"); + mutex_lock(&kvm->arch.apic_map_lock); /* * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map -- cgit v1.2.3 From 549240e8e09e063b7ba44c9f8497e8499562a34c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:50 -0700 Subject: KVM: x86: Remove defunct BSP "update" in local APIC reset Remove a BSP APIC update in kvm_lapic_reset() that is a glorified and confusing nop. When the code was originally added, kvm_vcpu_is_bsp() queried kvm->arch.bsp_vcpu, i.e. the intent was to set the BSP bit in the BSP vCPU's APIC. But, stuffing the BSP bit at INIT was wrong since the guest can change its BSP(s); this was fixed by commit 58d269d8cccc ("KVM: x86: BSP in MSR_IA32_APICBASE is writable"). In other words, kvm_vcpu_is_bsp() is now purely a reflection of vcpu->arch.apic_base.MSR_IA32_APICBASE_BSP, thus the update will always set the current value and kvm_lapic_set_base() is effectively a nop if the new and old values match. The RESET case, which does need to stuff the BSP for the reset vCPU, is handled by vendor code (though this will soon be moved to common code). No functional change intended. Reviewed-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-13-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index add4dd1e3528..a24ce8fe93e5 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2367,9 +2367,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); - if (kvm_vcpu_is_bsp(vcpu)) - kvm_lapic_set_base(vcpu, - vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP); + vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); if (vcpu->arch.apicv_active) { -- cgit v1.2.3 From 0214f6bbe564632adba299e38023d681c1bd68c5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:51 -0700 Subject: KVM: x86: Migrate the PIT only if vcpu0 is migrated, not any BSP Make vcpu0 the arbitrary owner of the PIT, as was intended when PIT migration was added by commit 2f5997140f22 ("KVM: migrate PIT timer"). The PIT was unintentionally turned into being owned by the BSP by commit c5af89b68abb ("KVM: Introduce kvm_vcpu_is_bsp() function."), and was then unintentionally converted to a shared ownership model when kvm_vcpu_is_bsp() was modified to check the APIC base MSR instead of hardcoding vcpu0 as the BSP. Functionally, this just means the PIT's hrtimer is migrated less often. The real motivation is to remove the usage of kvm_vcpu_is_bsp(), so that more legacy/broken crud can be removed in a future patch. Fixes: 58d269d8cccc ("KVM: x86: BSP in MSR_IA32_APICBASE is writable") Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-14-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index a6e218c6140d..5a69cce4d72d 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -220,7 +220,8 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) struct kvm_pit *pit = vcpu->kvm->arch.vpit; struct hrtimer *timer; - if (!kvm_vcpu_is_bsp(vcpu) || !pit) + /* Somewhat arbitrarily make vcpu0 the owner of the PIT. */ + if (vcpu->vcpu_id || !pit) return; timer = &pit->pit_state.timer; -- cgit v1.2.3 From 01913c57c225a8301e9a53447507f310a4be22b6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:52 -0700 Subject: KVM: x86: Don't force set BSP bit when local APIC is managed by userspace Don't set the BSP bit in vcpu->arch.apic_base when the local APIC is managed by userspace. Forcing all vCPUs to be BSPs is non-sensical, and was dead code when it was added by commit 97222cc83163 ("KVM: Emulate local APIC in kernel"). At the time, kvm_lapic_set_base() was invoked if and only if the local APIC was in-kernel (and it couldn't be called before the vCPU created its APIC). kvm_lapic_set_base() eventually gained generic usage, but the latent bug escaped notice because the only true consumer would be the guest itself in the form of an explicit RDMSRs on APs. Out of Linux, SeaBIOS, and EDK2/OVMF, only OVMF consumes the BSP bit from the APIC_BASE MSR. For the vast majority of usage in OVMF, BSP confusion would be benign. OVMF's BSP election upon SMI rendezvous might be broken, but practically no one runs KVM with an out-of-kernel local APIC, let alone does so while utilizing SMIs with OVMF. Fixes: 97222cc83163 ("KVM: Emulate local APIC in kernel") Reviewed-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-15-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a24ce8fe93e5..acb201d16b5e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2268,9 +2268,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) - value |= MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) -- cgit v1.2.3 From 503bc49424df4802ca34e4e1a024381fd7ced80e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:53 -0700 Subject: KVM: x86: Set BSP bit in reset BSP vCPU's APIC base by default Set the BSP bit appropriately during local APIC "reset" instead of relying on vendor code to clean up at a later point. This is a step towards consolidating the local APIC, VMX, and SVM xAPIC initialization code. Reviewed-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-16-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index acb201d16b5e..0fb282b64c8f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2321,6 +2321,7 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_apicv); void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_lapic *apic = vcpu->arch.apic; + u64 msr_val; int i; if (!apic) @@ -2330,8 +2331,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) hrtimer_cancel(&apic->lapic_timer.timer); if (!init_event) { - kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE); + msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(vcpu)) + msr_val |= MSR_IA32_APICBASE_BSP; + kvm_lapic_set_base(vcpu, msr_val); kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); } kvm_apic_set_version(apic->vcpu); -- cgit v1.2.3 From f0428b3dcb2d7efe4b2a2304841645b48f0b6499 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:54 -0700 Subject: KVM: VMX: Stuff vcpu->arch.apic_base directly at vCPU RESET Write vcpu->arch.apic_base directly instead of bouncing through kvm_set_apic_base(). This is a glorified nop, and is a step towards cleaning up the mess that is local APIC creation. When using an in-kernel APIC, kvm_create_lapic() explicitly sets vcpu->arch.apic_base to MSR_IA32_APICBASE_ENABLE to avoid its own kvm_lapic_set_base() call in kvm_lapic_reset() from triggering state changes. That call during RESET exists purely to set apic->base_address to the default base value. As a result, by the time VMX gets control, the only missing piece is the BSP bit being set for the reset BSP. For a userspace APIC, there are no side effects to process (for the APIC). In both cases, the call to kvm_update_cpuid_runtime() is a nop because the vCPU hasn't yet been exposed to userspace, i.e. there can't be any CPUID entries. No functional change intended. Reviewed-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-17-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 207393a429d1..2fc232e1bc20 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4391,7 +4391,6 @@ static void init_vmcs(struct vcpu_vmx *vmx) static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct msr_data apic_base_msr; u32 eax, dummy; u64 cr0; @@ -4409,12 +4408,10 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_set_cr8(vcpu, 0); if (!init_event) { - apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; + vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_reset_bsp(vcpu)) - apic_base_msr.data |= MSR_IA32_APICBASE_BSP; - apic_base_msr.host_initiated = true; - kvm_set_apic_base(vcpu, &apic_base_msr); + vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; } vmx_segment_cache_clear(vmx); -- cgit v1.2.3 From 421221234ada41b4a9f0beeb08e30b07388bd4bd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:55 -0700 Subject: KVM: x86: Open code necessary bits of kvm_lapic_set_base() at vCPU RESET Stuff vcpu->arch.apic_base and apic->base_address directly during APIC reset, as opposed to bouncing through kvm_set_apic_base() while fudging the ENABLE bit during creation to avoid the other, unwanted side effects. This is a step towards consolidating the APIC RESET logic across x86, VMX, and SVM. Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-18-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0fb282b64c8f..295a9d02a9a5 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2321,7 +2321,6 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_apicv); void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_lapic *apic = vcpu->arch.apic; - u64 msr_val; int i; if (!apic) @@ -2331,10 +2330,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) hrtimer_cancel(&apic->lapic_timer.timer); if (!init_event) { - msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; + vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_reset_bsp(vcpu)) - msr_val |= MSR_IA32_APICBASE_BSP; - kvm_lapic_set_base(vcpu, msr_val); + vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + + apic->base_address = APIC_DEFAULT_PHYS_BASE; + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); } kvm_apic_set_version(apic->vcpu); @@ -2477,11 +2479,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) lapic_timer_advance_dynamic = false; } - /* - * APIC is created enabled. This will prevent kvm_lapic_set_base from - * thinking that APIC state has changed. - */ - vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_iodevice_init(&apic->dev, &apic_mmio_ops); -- cgit v1.2.3 From 4547700a4d190ac419abffa317069aaccf1ac118 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:56 -0700 Subject: KVM: x86: Consolidate APIC base RESET initialization code Consolidate the APIC base RESET logic, which is currently spread out across both x86 and vendor code. For an in-kernel APIC, the vendor code is redundant. But for a userspace APIC, KVM relies on the vendor code to initialize vcpu->arch.apic_base. Hoist the vcpu->arch.apic_base initialization above the !apic check so that it applies to both flavors of APIC emulation, and delete the vendor code. Reviewed-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-19-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 12 +++++++----- arch/x86/kvm/svm/svm.c | 6 ------ arch/x86/kvm/vmx/vmx.c | 7 ------- 3 files changed, 7 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 295a9d02a9a5..76fb00921203 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2323,6 +2323,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) struct kvm_lapic *apic = vcpu->arch.apic; int i; + if (!init_event) { + vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(vcpu)) + vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + } + if (!apic) return; @@ -2330,11 +2337,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) hrtimer_cancel(&apic->lapic_timer.timer); if (!init_event) { - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; - apic->base_address = APIC_DEFAULT_PHYS_BASE; kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a9af9bff4755..acb7bd14110b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1336,12 +1336,6 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; - if (!init_event) { - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; - } init_vmcb(vcpu); /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2fc232e1bc20..e487f10e2877 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4407,13 +4407,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); - if (!init_event) { - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; - } - vmx_segment_cache_clear(vmx); seg_setup(VCPU_SREG_CS); -- cgit v1.2.3 From 49d8665cc20b973995b5f519222d3270daa82f3f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:57 -0700 Subject: KVM: x86: Move EDX initialization at vCPU RESET to common code Move the EDX initialization at vCPU RESET, which is now identical between VMX and SVM, into common code. No functional change intended. Reviewed-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-20-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 ----- arch/x86/kvm/svm/svm.c | 13 ------------- arch/x86/kvm/vmx/vmx.c | 6 ------ arch/x86/kvm/x86.c | 13 +++++++++++++ 4 files changed, 13 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d798650ad793..ec8e4aca69c8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1773,11 +1773,6 @@ static inline unsigned long read_msr(unsigned long msr) } #endif -static inline u32 get_rdx_init_val(void) -{ - return 0x600; /* P6 family */ -} - static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) { kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index acb7bd14110b..f42ec7d4b0cd 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1330,25 +1330,12 @@ static void init_vmcb(struct kvm_vcpu *vcpu) static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); - u32 dummy; - u32 eax = 1; svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; init_vmcb(vcpu); - /* - * Fall back to KVM's default Family/Model/Stepping if no CPUID match - * is found. Note, it's impossible to get a match at RESET since KVM - * emulates RESET before exposing the vCPU to userspace, i.e. it's - * impossible for kvm_cpuid() to find a valid entry on RESET. But, go - * through the motions in case that's ever remedied, and to be pedantic. - */ - if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true)) - eax = get_rdx_init_val(); - kvm_rdx_write(vcpu, eax); - if (kvm_vcpu_apicv_active(vcpu) && !init_event) avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e487f10e2877..ad41f3423acf 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4391,7 +4391,6 @@ static void init_vmcs(struct vcpu_vmx *vmx) static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 eax, dummy; u64 cr0; vmx->rmode.vm86_active = 0; @@ -4399,11 +4398,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->msr_ia32_umwait_control = 0; - eax = 1; - if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true)) - eax = get_rdx_init_val(); - kvm_rdx_write(vcpu, eax); - vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 164ecca7769e..ed8b6d1b4e85 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10792,6 +10792,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { unsigned long old_cr0 = kvm_read_cr0(vcpu); + u32 eax, dummy; kvm_lapic_reset(vcpu, init_event); @@ -10858,6 +10859,18 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; + /* + * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon) + * if no CPUID match is found. Note, it's impossible to get a match at + * RESET since KVM emulates RESET before exposing the vCPU to userspace, + * i.e. it'simpossible for kvm_cpuid() to find a valid entry on RESET. + * But, go through the motions in case that's ever remedied. + */ + eax = 1; + if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true)) + eax = 0x600; + kvm_rdx_write(vcpu, eax); + vcpu->arch.ia32_xss = 0; static_call(kvm_x86_vcpu_reset)(vcpu, init_event); -- cgit v1.2.3 From 9e90e215d9c9f013cc354c4fe1a1313531a97a05 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:58 -0700 Subject: KVM: SVM: Don't bother writing vmcb->save.rip at vCPU RESET/INIT Drop unnecessary initialization of vmcb->save.rip during vCPU RESET/INIT, as svm_vcpu_run() unconditionally propagates VCPU_REGS_RIP to save.rip. No true functional change intended. Reviewed-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-21-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f42ec7d4b0cd..9007adeb01fb 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1253,8 +1253,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_set_efer(vcpu, 0); save->dr6 = 0xffff0ff0; kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - save->rip = 0x0000fff0; - vcpu->arch.regs[VCPU_REGS_RIP] = save->rip; + vcpu->arch.regs[VCPU_REGS_RIP] = 0x0000fff0; /* * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. -- cgit v1.2.3 From ee5a5584cba316bc90bc2fad0c6d10b71f1791cb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:32:59 -0700 Subject: KVM: VMX: Invert handling of CR0.WP for EPT without unrestricted guest Opt-in to forcing CR0.WP=1 for shadow paging, and stop lying about WP being "always on" for unrestricted guest. In addition to making KVM a wee bit more honest, this paves the way for additional cleanup. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-22-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ad41f3423acf..2e84c31efb81 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -136,8 +136,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE #define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ - X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) @@ -2995,9 +2994,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); } -static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, - unsigned long cr0, - struct kvm_vcpu *vcpu) +static void ept_update_paging_mode_cr0(unsigned long cr0, struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3016,9 +3013,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } - - if (!(cr0 & X86_CR0_WP)) - *hw_cr0 &= ~X86_CR0_WP; } void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) @@ -3031,6 +3025,8 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; + if (!enable_ept) + hw_cr0 |= X86_CR0_WP; if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -3049,7 +3045,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) #endif if (enable_ept && !is_unrestricted_guest(vcpu)) - ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + ept_update_paging_mode_cr0(cr0, vcpu); vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); -- cgit v1.2.3 From 4f0dcb544038e016277fb691f1e60d52d7448cf6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:00 -0700 Subject: KVM: VMX: Remove direct write to vcpu->arch.cr0 during vCPU RESET/INIT Remove a bogus write to vcpu->arch.cr0 that immediately precedes vmx_set_cr0() during vCPU RESET/INIT. For RESET, this is a nop since the "old" CR0 value is meaningless. But for INIT, if the vCPU is coming from paging enabled mode, crushing vcpu->arch.cr0 will cause the various is_paging() checks in vmx_set_cr0() to get false negatives. For the exit_lmode() case, the false negative is benign as vmx_set_efer() is called immediately after vmx_set_cr0(). For EPT without unrestricted guest, the false negative will cause KVM to unnecessarily run with CR3 load/store exiting. But again, this is benign, albeit sub-optimal. Reviewed-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-23-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2e84c31efb81..1e555fb732bf 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4387,7 +4387,6 @@ static void init_vmcs(struct vcpu_vmx *vmx) static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 cr0; vmx->rmode.vm86_active = 0; vmx->spec_ctrl = 0; @@ -4455,9 +4454,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx->vcpu.arch.cr0 = cr0; - vmx_set_cr0(vcpu, cr0); /* enter rmode */ + vmx_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); vmx_set_cr4(vcpu, 0); vmx_set_efer(vcpu, 0); -- cgit v1.2.3 From c834fd7fc1308a0e0429d203a6c3af528cd902fa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:01 -0700 Subject: KVM: VMX: Fold ept_update_paging_mode_cr0() back into vmx_set_cr0() Move the CR0/CR3/CR4 shenanigans for EPT without unrestricted guest back into vmx_set_cr0(). This will allow a future patch to eliminate the rather gross stuffing of vcpu->arch.cr0 in the paging transition cases by snapshotting the old CR0. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-24-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1e555fb732bf..e4b1c24ad079 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2994,27 +2994,6 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); } -static void ept_update_paging_mode_cr0(unsigned long cr0, struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) - vmx_cache_reg(vcpu, VCPU_EXREG_CR3); - if (!(cr0 & X86_CR0_PG)) { - /* From paging/starting to nonpaging */ - exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } else if (!is_paging(vcpu)) { - /* From nonpaging to paging */ - exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } -} - void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3044,8 +3023,23 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif - if (enable_ept && !is_unrestricted_guest(vcpu)) - ept_update_paging_mode_cr0(cr0, vcpu); + if (enable_ept && !is_unrestricted_guest(vcpu)) { + if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) + vmx_cache_reg(vcpu, VCPU_EXREG_CR3); + if (!(cr0 & X86_CR0_PG)) { + /* From paging/starting to nonpaging */ + exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); + vcpu->arch.cr0 = cr0; + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); + } else if (!is_paging(vcpu)) { + /* From nonpaging to paging */ + exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); + vcpu->arch.cr0 = cr0; + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); + } + } vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); -- cgit v1.2.3 From 470750b3425513b9f63f176e564e63e0e7998afc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:02 -0700 Subject: KVM: nVMX: Do not clear CR3 load/store exiting bits if L1 wants 'em Keep CR3 load/store exiting enable as needed when running L2 in order to honor L1's desires. This fixes a largely theoretical bug where L1 could intercept CR3 but not CR0.PG and end up not getting the desired CR3 exits when L2 enables paging. In other words, the existing !is_paging() check inadvertantly handles the normal case for L2 where vmx_set_cr0() is called during VM-Enter, which is guaranteed to run with paging enabled, and thus will never clear the bits. Removing the !is_paging() check will also allow future consolidation and cleanup of the related code. From a performance perspective, this is all a nop, as the VMCS controls shadow will optimize away the VMWRITE when the controls are in the desired state. Add a comment explaining why CR3 is intercepted, with a big disclaimer about not querying the old CR3. Because vmx_set_cr0() is used for flows that are not directly tied to MOV CR3, e.g. vCPU RESET/INIT and nested VM-Enter, it's possible that is_paging() is not synchronized with CR3 load/store exiting. This is actually guaranteed in the current code, as KVM starts with CR3 interception disabled. Obviously that can be fixed, but there's no good reason to play whack-a-mole, and it tends to end poorly, e.g. descriptor table exiting for UMIP emulation attempted to be precise in the past and ended up botching the interception toggling. Fixes: fe3ef05c7572 ("KVM: nVMX: Prepare vmcs02 from vmcs01 and vmcs12") Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-25-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 46 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e4b1c24ad079..9f69ccc096a3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2994,10 +2994,14 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); } +#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING) + void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long hw_cr0; + u32 tmp; hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); if (is_unrestricted_guest(vcpu)) @@ -3024,18 +3028,42 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) #endif if (enable_ept && !is_unrestricted_guest(vcpu)) { + /* + * Ensure KVM has an up-to-date snapshot of the guest's CR3. If + * the below code _enables_ CR3 exiting, vmx_cache_reg() will + * (correctly) stop reading vmcs.GUEST_CR3 because it thinks + * KVM's CR3 is installed. + */ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) vmx_cache_reg(vcpu, VCPU_EXREG_CR3); + + /* + * When running with EPT but not unrestricted guest, KVM must + * intercept CR3 accesses when paging is _disabled_. This is + * necessary because restricted guests can't actually run with + * paging disabled, and so KVM stuffs its own CR3 in order to + * run the guest when identity mapped page tables. + * + * Do _NOT_ check the old CR0.PG, e.g. to optimize away the + * update, it may be stale with respect to CR3 interception, + * e.g. after nested VM-Enter. + * + * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or + * stores to forward them to L1, even if KVM does not need to + * intercept them to preserve its identity mapped page tables. + */ if (!(cr0 & X86_CR0_PG)) { - /* From paging/starting to nonpaging */ - exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } else if (!is_paging(vcpu)) { - /* From nonpaging to paging */ - exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); + exec_controls_setbit(vmx, CR3_EXITING_BITS); + } else if (!is_guest_mode(vcpu)) { + exec_controls_clearbit(vmx, CR3_EXITING_BITS); + } else { + tmp = exec_controls_get(vmx); + tmp &= ~CR3_EXITING_BITS; + tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; + exec_controls_set(vmx, tmp); + } + + if (!is_paging(vcpu) != !(cr0 & X86_CR0_PG)) { vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } -- cgit v1.2.3 From 81ca0e7340eedcbe67911d97f292837e0cf39709 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:03 -0700 Subject: KVM: VMX: Pull GUEST_CR3 from the VMCS iff CR3 load exiting is disabled Tweak the logic for grabbing vmcs.GUEST_CR3 in vmx_cache_reg() to look directly at the execution controls, as opposed to effectively inferring the controls based on vCPUs. Inferring the controls isn't wrong, but it creates a very subtle dependency between the caching logic, the state of vcpu->arch.cr0 (via is_paging()), and the behavior of vmx_set_cr0(). Using the execution controls doesn't completely eliminate the dependency in vmx_set_cr0(), e.g. neglecting to cache CR3 before enabling interception would still break the guest, but it does reduce the code dependency and mostly eliminate the logical dependency (that CR3 loads are intercepted in certain scenarios). Eliminating the subtle read of vcpu->arch.cr0 will also allow for additional cleanup in vmx_set_cr0(). Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-26-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9f69ccc096a3..0c1e578967ff 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2262,8 +2262,11 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; break; case VCPU_EXREG_CR3: - if (is_unrestricted_guest(vcpu) || - (enable_ept && is_paging(vcpu))) + /* + * When intercepting CR3 loads, e.g. for shadowing paging, KVM's + * CR3 is loaded into hardware, not the guest's CR3. + */ + if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); break; case VCPU_EXREG_CR4: -- cgit v1.2.3 From 908b7d43c02c5f1f95beff66ce4ca30dfc3b134c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:04 -0700 Subject: KVM: x86/mmu: Skip the permission_fault() check on MMIO if CR0.PG=0 Skip the MMU permission_fault() check if paging is disabled when verifying the cached MMIO GVA is usable. The check is unnecessary and can theoretically get a false positive since the MMU doesn't zero out "permissions" or "pkru_mask" when guest paging is disabled. The obvious alternative is to zero out all the bitmasks when configuring nonpaging MMUs, but that's unnecessary work and doesn't align with the MMU's general approach of doing as little as possible for flows that are supposed to be unreachable. This is nearly a nop as the false positive is nothing more than an insignificant performance blip, and more or less limited to string MMIO when L1 is running with paging disabled. KVM doesn't cache MMIO if L2 is active with nested TDP since the "GVA" is really an L2 GPA. If L2 is active without nested TDP, then paging can't be disabled as neither VMX nor SVM allows entering the guest without paging of some form. Jumping back to L1 with paging disabled, in that case direct_map is true and so KVM will use CR2 as a GPA; the only time it doesn't is if the fault from the emulator doesn't match or emulator_can_use_gpa(), and that fails only on string MMIO and other instructions with multiple memory operands. Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-27-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ed8b6d1b4e85..c56788f8d180 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6568,9 +6568,9 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, * there is no pkey in EPT page table for L1 guest or EPT * shadow page table for L2 guest. */ - if (vcpu_match_mmio_gva(vcpu, gva) - && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.mmio_access, 0, access)) { + if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) || + !permission_fault(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.mmio_access, 0, access))) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); -- cgit v1.2.3 From 32437c2aea428f9c3479904e342b02f5eee3a7f6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:05 -0700 Subject: KVM: VMX: Process CR0.PG side effects after setting CR0 assets Move the long mode and EPT w/o unrestricted guest side effect processing down in vmx_set_cr0() so that the EPT && !URG case doesn't have to stuff vcpu->arch.cr0 early. This also fixes an oddity where CR0 might not be marked available, i.e. the early vcpu->arch.cr0 write would appear to be in danger of being overwritten, though that can't actually happen in the current code since CR0.TS is the only guest-owned bit, and CR0.TS is not read by vmx_set_cr4(). Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-28-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0c1e578967ff..aa547100f0df 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3003,9 +3003,11 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long hw_cr0; + unsigned long hw_cr0, old_cr0_pg; u32 tmp; + old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); + hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); if (is_unrestricted_guest(vcpu)) hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; @@ -3021,11 +3023,16 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) enter_rmode(vcpu); } + vmcs_writel(CR0_READ_SHADOW, cr0); + vmcs_writel(GUEST_CR0, hw_cr0); + vcpu->arch.cr0 = cr0; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); + #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME) { - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) + if (!old_cr0_pg && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); - if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) + else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) exit_lmode(vcpu); } #endif @@ -3066,17 +3073,11 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) exec_controls_set(vmx, tmp); } - if (!is_paging(vcpu) != !(cr0 & X86_CR0_PG)) { - vcpu->arch.cr0 = cr0; + /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ + if ((old_cr0_pg ^ cr0) & X86_CR0_PG) vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } } - vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, hw_cr0); - vcpu->arch.cr0 = cr0; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); - /* depends on vcpu->arch.cr0 to be set to a new value */ vmx->emulation_required = emulation_required(vcpu); } -- cgit v1.2.3 From 1dd7a4f18fbcc0db1acc07df5c1ae8ba4ce10593 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:06 -0700 Subject: KVM: VMX: Skip emulation required checks during pmode/rmode transitions Don't refresh "emulation required" when stuffing segments during transitions to/from real mode when running without unrestricted guest. The checks are unnecessary as vmx_set_cr0() unconditionally rechecks "emulation required". They also happen to be broken, as enter_pmode() and enter_rmode() run with a stale vcpu->arch.cr0. Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-29-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aa547100f0df..ff3f75105ae9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2719,6 +2719,8 @@ static __init int alloc_kvm_area(void) return 0; } +static void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); + static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) { @@ -2735,7 +2737,7 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, save->dpl = save->selector & SEGMENT_RPL_MASK; save->s = 1; } - vmx_set_segment(vcpu, save, seg); + __vmx_set_segment(vcpu, save, seg); } static void enter_pmode(struct kvm_vcpu *vcpu) @@ -2756,7 +2758,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx->rmode.vm86_active = 0; - vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); + __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); flags = vmcs_readl(GUEST_RFLAGS); flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; @@ -3291,7 +3293,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) return ar; } -void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +static void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -3304,7 +3306,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) vmcs_write16(sf->selector, var->selector); else if (var->s) fix_rmode_seg(seg, &vmx->rmode.segs[seg]); - goto out; + return; } vmcs_writel(sf->base, var->base); @@ -3326,9 +3328,13 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) var->type |= 0x1; /* Accessed */ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); +} -out: - vmx->emulation_required = emulation_required(vcpu); +void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +{ + __vmx_set_segment(vcpu, var, seg); + + to_vmx(vcpu)->emulation_required = emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) -- cgit v1.2.3 From 816be9e9be8d2e19dcea35fbec06f00cd5749fed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:07 -0700 Subject: KVM: nVMX: Don't evaluate "emulation required" on nested VM-Exit Use the "internal" variants of setting segment registers when stuffing state on nested VM-Exit in order to skip the "emulation required" updates. VM-Exit must always go to protected mode, and all segments are mostly hardcoded (to valid values) on VM-Exit. The bits of the segments that aren't hardcoded are explicitly checked during VM-Enter, e.g. the selector RPLs must all be zero. Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-30-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 16 ++++++++-------- arch/x86/kvm/vmx/vmx.c | 6 ++---- arch/x86/kvm/vmx/vmx.h | 2 +- 3 files changed, 11 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7f8184f432b4..a77cfc8bcf11 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4267,7 +4267,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, seg.l = 1; else seg.db = 1; - vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); seg = (struct kvm_segment) { .base = 0, .limit = 0xFFFFFFFF, @@ -4278,17 +4278,17 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, .g = 1 }; seg.selector = vmcs12->host_ds_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); seg.selector = vmcs12->host_es_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); seg.selector = vmcs12->host_ss_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); seg.selector = vmcs12->host_fs_selector; seg.base = vmcs12->host_fs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); seg.selector = vmcs12->host_gs_selector; seg.base = vmcs12->host_gs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); seg = (struct kvm_segment) { .base = vmcs12->host_tr_base, .limit = 0x67, @@ -4296,11 +4296,11 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, .type = 11, .present = 1 }; - vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); memset(&seg, 0, sizeof(seg)); seg.unusable = 1; - vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ff3f75105ae9..359c30be62bb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2719,8 +2719,6 @@ static __init int alloc_kvm_area(void) return 0; } -static void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); - static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) { @@ -3293,7 +3291,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) return ar; } -static void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -3330,7 +3328,7 @@ static void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, in vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); } -void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { __vmx_set_segment(vcpu, var, seg); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index db88ed4f2121..fd076bd26676 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -371,7 +371,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From ef8a0fa59be78e7b80bdf35c3391d60c061c7b24 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:08 -0700 Subject: KVM: SVM: Tweak order of cr0/cr4/efer writes at RESET/INIT Hoist svm_set_cr0() up in the sequence of register initialization during vCPU RESET/INIT, purely to match VMX so that a future patch can move the sequences to common x86. Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-31-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9007adeb01fb..0afa060798a5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1249,18 +1249,13 @@ static void init_vmcb(struct kvm_vcpu *vcpu) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); + svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); svm_set_cr4(vcpu, 0); svm_set_efer(vcpu, 0); save->dr6 = 0xffff0ff0; kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); vcpu->arch.regs[VCPU_REGS_RIP] = 0x0000fff0; - /* - * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. - * It also updates the guest-visible cr0 value. - */ - svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); - save->cr4 = X86_CR4_PAE; if (npt_enabled) { -- cgit v1.2.3 From 6cfe7b83acdcdd4d4ea9354f007a88bc0ccb16b9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:09 -0700 Subject: KVM: SVM: Drop redundant writes to vmcb->save.cr4 at RESET/INIT Drop direct writes to vmcb->save.cr4 during vCPU RESET/INIT, as the values being written are fully redundant with respect to svm_set_cr4(vcpu, 0) a few lines earlier. Note, svm_set_cr4() also correctly forces X86_CR4_PAE when NPT is disabled. No functional change intended. Reviewed-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-32-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0afa060798a5..1038bfdd7bbb 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1256,8 +1256,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu) kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); vcpu->arch.regs[VCPU_REGS_RIP] = 0x0000fff0; - save->cr4 = X86_CR4_PAE; - if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; @@ -1267,7 +1265,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_clr_intercept(svm, INTERCEPT_CR3_WRITE); save->g_pat = vcpu->arch.pat; save->cr3 = 0; - save->cr4 = 0; } svm->current_vmcb->asid_generation = 0; svm->asid = 0; -- cgit v1.2.3 From d0f9f826d8ac06446391ceb3d4a440f5b48b3134 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:10 -0700 Subject: KVM: SVM: Stuff save->dr6 at during VMSA sync, not at RESET/INIT Move code to stuff vmcb->save.dr6 to its architectural init value from svm_vcpu_reset() into sev_es_sync_vmsa(). Except for protected guests, a.k.a. SEV-ES guests, vmcb->save.dr6 is set during VM-Enter, i.e. the extra write is unnecessary. For SEV-ES, stuffing save->dr6 handles a theoretical case where the VMSA could be encrypted before the first KVM_RUN. Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-33-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 1 + arch/x86/kvm/svm/svm.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 6710d9ee2e4b..9f1585f40c85 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -584,6 +584,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xcr0 = svm->vcpu.arch.xcr0; save->pkru = svm->vcpu.arch.pkru; save->xss = svm->vcpu.arch.ia32_xss; + save->dr6 = svm->vcpu.arch.dr6; /* * SEV-ES will use a VMSA that is pointed to by the VMCB, not diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1038bfdd7bbb..64563f876fdf 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1252,7 +1252,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); svm_set_cr4(vcpu, 0); svm_set_efer(vcpu, 0); - save->dr6 = 0xffff0ff0; kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); vcpu->arch.regs[VCPU_REGS_RIP] = 0x0000fff0; -- cgit v1.2.3 From 400dd54b37172b64e4ceaa6bfaf2729585840375 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:11 -0700 Subject: KVM: VMX: Skip pointless MSR bitmap update when setting EFER Split setup_msrs() into vmx_setup_uret_msrs() and an open coded refresh of the MSR bitmap, and skip the latter when refreshing the user return MSRs during an EFER load. Only the x2APIC MSRs are dynamically exposed and hidden, and those are not affected by a change in EFER. Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-34-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 359c30be62bb..d5d9fc5df259 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1647,11 +1647,12 @@ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, } /* - * Set up the vmcs to automatically save and restore system - * msrs. Don't touch the 64-bit msrs if the guest is in legacy - * mode, as fiddling with msrs is very expensive. + * Configuring user return MSRs to automatically save, load, and restore MSRs + * that need to be shoved into hardware when running the guest. Note, omitting + * an MSR here does _NOT_ mean it's not emulated, only that it will not be + * loaded into hardware when running the guest. */ -static void setup_msrs(struct vcpu_vmx *vmx) +static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) { #ifdef CONFIG_X86_64 bool load_syscall_msrs; @@ -1681,9 +1682,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) */ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(&vmx->vcpu); - /* * The set of MSRs to load may have changed, reload MSRs before the * next VM-Enter. @@ -2874,7 +2872,7 @@ int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) msr->data = efer & ~EFER_LME; } - setup_msrs(vmx); + vmx_setup_uret_msrs(vmx); return 0; } @@ -4470,7 +4468,10 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (kvm_mpx_supported()) vmcs_write64(GUEST_BNDCFGS, 0); - setup_msrs(vmx); + vmx_setup_uret_msrs(vmx); + + if (cpu_has_vmx_msr_bitmap()) + vmx_update_msr_bitmap(&vmx->vcpu); vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ -- cgit v1.2.3 From 432979b5034208890e323e86724417f02825abb7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:12 -0700 Subject: KVM: VMX: Refresh list of user return MSRs after setting guest CPUID After a CPUID update, refresh the list of user return MSRs that are loaded into hardware when running the vCPU. This is necessary to handle the oddball case where userspace exposes X86_FEATURE_RDTSCP to the guest after the vCPU is running. Fixes: 0023ef39dc35 ("kvm: vmx: Set IA32_TSC_AUX for legacy mode guests") Fixes: 4e47c7a6d714 ("KVM: VMX: Add instruction rdtscp support for guest") Reviewed-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-35-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d5d9fc5df259..09c5a021e49b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7170,6 +7170,8 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */ vcpu->arch.xsaves_enabled = false; + vmx_setup_uret_msrs(vmx); + if (cpu_has_secondary_exec_ctrls()) { vmx_compute_secondary_exec_control(vmx); vmcs_set_secondary_exec_control(vmx); -- cgit v1.2.3 From c5c9f920f7a50ea205c9efec7e589556ebaf85dc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:13 -0700 Subject: KVM: VMX: Don't _explicitly_ reconfigure user return MSRs on vCPU INIT When emulating vCPU INIT, do not unconditionally refresh the list of user return MSRs that need to be loaded into hardware when running the guest. Unconditionally refreshing the list is confusing, as the vast majority of MSRs are not modified on INIT. The real motivation is to handle the case where an INIT during long mode obviates the need to load the SYSCALL MSRs, and that is handled as needed by vmx_set_efer(). Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-36-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 09c5a021e49b..2e2b469b8ced 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4410,6 +4410,8 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx->pt_desc.guest.output_mask = 0x7F; vmcs_write64(GUEST_IA32_RTIT_CTL, 0); } + + vmx_setup_uret_msrs(vmx); } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -4468,8 +4470,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (kvm_mpx_supported()) vmcs_write64(GUEST_BNDCFGS, 0); - vmx_setup_uret_msrs(vmx); - if (cpu_has_vmx_msr_bitmap()) vmx_update_msr_bitmap(&vmx->vcpu); -- cgit v1.2.3 From f39e805ee115a67878c5475f1ef84466d424bb2e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:14 -0700 Subject: KVM: x86: Move setting of sregs during vCPU RESET/INIT to common x86 Move the setting of CR0, CR4, EFER, RFLAGS, and RIP from vendor code to common x86. VMX and SVM now have near-identical sequences, the only difference being that VMX updates the exception bitmap. Updating the bitmap on SVM is unnecessary, but benign. Unfortunately it can't be left behind in VMX due to the need to update exception intercepts after the control registers are set. Reviewed-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-37-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 6 ------ arch/x86/kvm/vmx/vmx.c | 9 --------- arch/x86/kvm/x86.c | 8 ++++++++ 3 files changed, 8 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 64563f876fdf..8ebdcd92007d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1249,12 +1249,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); - svm_set_cr4(vcpu, 0); - svm_set_efer(vcpu, 0); - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - vcpu->arch.regs[VCPU_REGS_RIP] = 0x0000fff0; - if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2e2b469b8ced..7518d4cf8156 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4455,9 +4455,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write64(GUEST_IA32_DEBUGCTL, 0); } - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - kvm_rip_write(vcpu, 0xfff0); - vmcs_writel(GUEST_GDTR_BASE, 0); vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); @@ -4485,12 +4482,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - vmx_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); - vmx_set_cr4(vcpu, 0); - vmx_set_efer(vcpu, 0); - - vmx_update_exception_bitmap(vcpu); - vpid_sync_context(vmx->vpid); if (init_event) vmx_clear_hlt(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c56788f8d180..6c55f2e83a7c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10875,6 +10875,14 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) static_call(kvm_x86_vcpu_reset)(vcpu, init_event); + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); + kvm_rip_write(vcpu, 0xfff0); + + static_call(kvm_x86_set_cr0)(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); + static_call(kvm_x86_set_cr4)(vcpu, 0); + static_call(kvm_x86_set_efer)(vcpu, 0); + static_call(kvm_x86_update_exception_bitmap)(vcpu); + /* * Reset the MMU context if paging was enabled prior to INIT (which is * implied if CR0.PG=1 as CR0 will be '0' prior to RESET). Unlike the -- cgit v1.2.3 From 9e4784e19daaa5cd637753d5300fc5a42d0c694a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:15 -0700 Subject: KVM: VMX: Remove obsolete MSR bitmap refresh at vCPU RESET/INIT Remove an unnecessary MSR bitmap refresh during vCPU RESET/INIT. In both cases, the MSR bitmap already has the desired values and state. At RESET, the vCPU is guaranteed to be running with x2APIC disabled, the x2APIC MSRs are guaranteed to be intercepted due to the MSR bitmap being initialized to all ones by alloc_loaded_vmcs(), and vmx->msr_bitmap_mode is guaranteed to be zero, i.e. reflecting x2APIC disabled. At INIT, the APIC_BASE MSR is not modified, thus there can't be any change in x2APIC state. Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-38-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7518d4cf8156..af7aa9bb4bdb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4467,9 +4467,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (kvm_mpx_supported()) vmcs_write64(GUEST_BNDCFGS, 0); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(&vmx->vcpu); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ if (cpu_has_vmx_tpr_shadow() && !init_event) { -- cgit v1.2.3 From 284036c644a1dc71890503c849b22c3126308067 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:16 -0700 Subject: KVM: nVMX: Remove obsolete MSR bitmap refresh at nested transitions Drop unnecessary MSR bitmap updates during nested transitions, as L1's APIC_BASE MSR is not modified by the standard VM-Enter/VM-Exit flows, and L2's MSR bitmap is managed separately. In the unlikely event that L1 is pathological and loads APIC_BASE via the VM-Exit load list, KVM will handle updating the bitmap in its normal WRMSR flows. Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-39-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 6 ------ arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/vmx.h | 1 - 3 files changed, 1 insertion(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a77cfc8bcf11..0d0dd6580cfd 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4305,9 +4305,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); - if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); @@ -4386,9 +4383,6 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); - /* * This nasty bit of open coding is a compromise between blindly * loading L1's MSRs using the exit load lists (incorrect emulation diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index af7aa9bb4bdb..6e79f1b2653b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3865,7 +3865,7 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) } } -void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) +static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u8 mode = vmx_msr_bitmap_mode(vcpu); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index fd076bd26676..a408a9070662 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -376,7 +376,6 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); -void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 002f87a41e9a6ef795f7f493f9434f51fb9f7d35 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:17 -0700 Subject: KVM: VMX: Don't redo x2APIC MSR bitmaps when userspace filter is changed Drop an explicit call to update the x2APIC MSRs when the userspace MSR filter is modified. The x2APIC MSRs are deliberately exempt from userspace filtering. Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-40-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6e79f1b2653b..7100e70ab718 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3936,7 +3936,6 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) } pt_update_intercept_for_msr(vcpu); - vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu)); } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From e7c701dd7a5019c8628007c91c0e8d804c194667 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:18 -0700 Subject: KVM: VMX: Remove unnecessary initialization of msr_bitmap_mode Don't bother initializing msr_bitmap_mode to 0, all of struct vcpu_vmx is zero initialized. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-41-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7100e70ab718..308820173505 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6841,7 +6841,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); } - vmx->msr_bitmap_mode = 0; vmx->loaded_vmcs = &vmx->vmcs01; cpu = get_cpu(); -- cgit v1.2.3 From 84ec8d2d539f7286d4504c2d377002f1ea7458d6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:19 -0700 Subject: KVM: VMX: Smush x2APIC MSR bitmap adjustments into single function Consolidate all of the dynamic MSR bitmap adjustments into vmx_update_msr_bitmap_x2apic(), and rename the mode tracker to reflect that it is x2APIC specific. If KVM gains more cases of dynamic MSR pass-through, odds are very good that those new cases will be better off with their own logic, e.g. see Intel PT MSRs and MSR_IA32_SPEC_CTRL. Attempting to handle all updates in a common helper did more harm than good, as KVM ended up collecting a large number of useless "updates". Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-42-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 55 +++++++++++++++++++------------------------------- arch/x86/kvm/vmx/vmx.h | 2 +- 2 files changed, 22 insertions(+), 35 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 308820173505..96d9147620bf 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3812,21 +3812,6 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) vmx_set_msr_bitmap_write(msr_bitmap, msr); } -static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) -{ - u8 mode = 0; - - if (cpu_has_secondary_exec_ctrls() && - (secondary_exec_controls_get(to_vmx(vcpu)) & - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { - mode |= MSR_BITMAP_MODE_X2APIC; - if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) - mode |= MSR_BITMAP_MODE_X2APIC_APICV; - } - - return mode; -} - static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) { unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; @@ -3844,11 +3829,29 @@ static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) } } -static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) +static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + u8 mode; + if (!cpu_has_vmx_msr_bitmap()) return; + if (cpu_has_secondary_exec_ctrls() && + (secondary_exec_controls_get(vmx) & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { + mode = MSR_BITMAP_MODE_X2APIC; + if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) + mode |= MSR_BITMAP_MODE_X2APIC_APICV; + } else { + mode = 0; + } + + if (mode == vmx->x2apic_msr_bitmap_mode) + return; + + vmx->x2apic_msr_bitmap_mode = mode; + vmx_reset_x2apic_msrs(vcpu, mode); /* @@ -3865,21 +3868,6 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) } } -static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u8 mode = vmx_msr_bitmap_mode(vcpu); - u8 changed = mode ^ vmx->msr_bitmap_mode; - - if (!changed) - return; - - if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) - vmx_update_msr_bitmap_x2apic(vcpu, mode); - - vmx->msr_bitmap_mode = mode; -} - void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4139,8 +4127,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); } - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); + vmx_update_msr_bitmap_x2apic(vcpu); } u32 vmx_exec_control(struct vcpu_vmx *vmx) @@ -6190,7 +6177,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) } secondary_exec_controls_set(vmx, sec_exec_control); - vmx_update_msr_bitmap(vcpu); + vmx_update_msr_bitmap_x2apic(vcpu); } static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a408a9070662..0ecc41189292 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -227,7 +227,7 @@ struct nested_vmx { struct vcpu_vmx { struct kvm_vcpu vcpu; u8 fail; - u8 msr_bitmap_mode; + u8 x2apic_msr_bitmap_mode; /* * If true, host state has been stored in vmx->loaded_vmcs for -- cgit v1.2.3 From 7aa13fc3d8260c4215b8aea971d120e675d8781f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:20 -0700 Subject: KVM: VMX: Remove redundant write to set vCPU as active at RESET/INIT Drop a call to vmx_clear_hlt() during vCPU INIT, the guest's activity state is unconditionally set to "active" a few lines earlier in vmx_vcpu_reset(). No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-43-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 96d9147620bf..b306ca2238ed 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4466,8 +4466,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); vpid_sync_context(vmx->vpid); - if (init_event) - vmx_clear_hlt(vcpu); } static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From e54949408abfbe3905ae19ac3b42ecd2d29bc3c5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:21 -0700 Subject: KVM: VMX: Move RESET-only VMWRITE sequences to init_vmcs() Move VMWRITE sequences in vmx_vcpu_reset() guarded by !init_event into init_vmcs() to make it more obvious that they're, uh, initializing the VMCS. No meaningful functional change intended (though the order of VMWRITEs and whatnot is different). Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-44-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b306ca2238ed..ae8e62df16dd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4397,6 +4397,19 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write64(GUEST_IA32_RTIT_CTL, 0); } + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (cpu_has_vmx_tpr_shadow()) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); + if (cpu_need_tpr_shadow(&vmx->vcpu)) + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + __pa(vmx->vcpu.arch.apic->regs)); + vmcs_write32(TPR_THRESHOLD, 0); + } + vmx_setup_uret_msrs(vmx); } @@ -4434,13 +4447,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); - if (!init_event) { - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - } - vmcs_writel(GUEST_GDTR_BASE, 0); vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); @@ -4455,14 +4461,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ - if (cpu_has_vmx_tpr_shadow() && !init_event) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (cpu_need_tpr_shadow(vcpu)) - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - __pa(vcpu->arch.apic->regs)); - vmcs_write32(TPR_THRESHOLD, 0); - } - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); vpid_sync_context(vmx->vpid); -- cgit v1.2.3 From 265e43530cb2eb14d5b78e89d310c79959e0eb26 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:22 -0700 Subject: KVM: SVM: Emulate #INIT in response to triple fault shutdown Emulate a full #INIT instead of simply initializing the VMCB if the guest hits a shutdown. Initializing the VMCB but not other vCPU state, much of which is mirrored by the VMCB, results in incoherent and broken vCPU state. Ideally, KVM would not automatically init anything on shutdown, and instead put the vCPU into e.g. KVM_MP_STATE_UNINITIALIZED and force userspace to explicitly INIT or RESET the vCPU. Even better would be to add KVM_MP_STATE_SHUTDOWN, since technically NMI can break shutdown (and SMI on Intel CPUs). But, that ship has sailed, and emulating #INIT is the next best thing as that has at least some connection with reality since there exist bare metal platforms that automatically INIT the CPU if it hits shutdown. Fixes: 46fe4ddd9dbb ("[PATCH] KVM: SVM: Propagate cpu shutdown events to userspace") Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-45-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 10 +++++++--- arch/x86/kvm/x86.c | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8ebdcd92007d..4102e9e3bcdc 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2050,11 +2050,15 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) return -EINVAL; /* - * VMCB is undefined after a SHUTDOWN intercept - * so reinitialize it. + * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put + * the VMCB in a known good state. Unfortuately, KVM doesn't have + * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking + * userspace. At a platform view, INIT is acceptable behavior as + * there exist bare metal platforms that automatically INIT the CPU + * in response to shutdown. */ clear_page(svm->vmcb); - init_vmcb(vcpu); + kvm_vcpu_reset(vcpu, true); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6c55f2e83a7c..5b04c07c1ec5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10906,6 +10906,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (init_event) kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } +EXPORT_SYMBOL_GPL(kvm_vcpu_reset); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) { -- cgit v1.2.3 From 46f4898b207ffeeeaebc0fdd4bf4082bf0f88107 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:23 -0700 Subject: KVM: SVM: Drop redundant clearing of vcpu->arch.hflags at INIT/RESET Drop redundant clears of vcpu->arch.hflags in init_vmcb() since kvm_vcpu_reset() always clears hflags, and it is also always zero at vCPU creation time. And of course, the second clearing in init_vmcb() was always redundant. Suggested-by: Reiji Watanabe Reviewed-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-46-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 4102e9e3bcdc..9d72b1df426e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1161,8 +1161,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; - vcpu->arch.hflags = 0; - svm_set_intercept(svm, INTERCEPT_CR0_READ); svm_set_intercept(svm, INTERCEPT_CR3_READ); svm_set_intercept(svm, INTERCEPT_CR4_READ); @@ -1264,7 +1262,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm->nested.vmcb12_gpa = INVALID_GPA; svm->nested.last_vmcb12_gpa = INVALID_GPA; - vcpu->arch.hflags = 0; if (!kvm_pause_in_guest(vcpu->kvm)) { control->pause_filter_count = pause_filter_count; -- cgit v1.2.3 From 4c72ab5aa6e0ac2a5c11f9180e1fff89d7f2d38b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jul 2021 09:33:24 -0700 Subject: KVM: x86: Preserve guest's CR0.CD/NW on INIT Preserve CR0.CD and CR0.NW on INIT instead of forcing them to '1', as defined by both Intel's SDM and AMD's APM. Note, current versions of Intel's SDM are very poorly written with respect to INIT behavior. Table 9-1. "IA-32 and Intel 64 Processor States Following Power-up, Reset, or INIT" quite clearly lists power-up, RESET, _and_ INIT as setting CR0=60000010H, i.e. CD/NW=1. But the SDM then attempts to qualify CD/NW behavior in a footnote: 2. The CD and NW flags are unchanged, bit 4 is set to 1, all other bits are cleared. Presumably that footnote is only meant for INIT, as the RESET case and especially the power-up case are rather non-sensical. Another footnote all but confirms that: 6. Internal caches are invalid after power-up and RESET, but left unchanged with an INIT. Bare metal testing shows that CD/NW are indeed preserved on INIT (someone else can hack their BIOS to check RESET and power-up :-D). Reported-by: Reiji Watanabe Reviewed-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210713163324.627647-47-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5b04c07c1ec5..4d246b7f6ce1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10792,6 +10792,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long new_cr0; u32 eax, dummy; kvm_lapic_reset(vcpu, init_event); @@ -10878,7 +10879,18 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0xfff0); - static_call(kvm_x86_set_cr0)(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); + /* + * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions + * of Intel's SDM list CD/NW as being set on INIT, but they contradict + * (or qualify) that with a footnote stating that CD/NW are preserved. + */ + new_cr0 = X86_CR0_ET; + if (init_event) + new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD)); + else + new_cr0 |= X86_CR0_NW | X86_CR0_CD; + + static_call(kvm_x86_set_cr0)(vcpu, new_cr0); static_call(kvm_x86_set_cr4)(vcpu, 0); static_call(kvm_x86_set_efer)(vcpu, 0); static_call(kvm_x86_update_exception_bitmap)(vcpu); -- cgit v1.2.3 From db105fab8d141fc0d9179600c51eba0d168dad34 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 2 Aug 2021 08:52:50 -0400 Subject: KVM: nSVM: remove useless kvm_clear_*_queue For an event to be in injected state when nested_svm_vmrun executes, it must have come from exitintinfo when svm_complete_interrupts ran: vcpu_enter_guest static_call(kvm_x86_run) -> svm_vcpu_run svm_complete_interrupts // now the event went from "exitintinfo" to "injected" static_call(kvm_x86_handle_exit) -> handle_exit svm_invoke_exit_handler vmrun_interception nested_svm_vmrun However, no event could have been in exitintinfo before a VMRUN vmexit. The code in svm.c is a bit more permissive than the one in vmx.c: if (is_external_interrupt(svm->vmcb->control.exit_int_info) && exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) but in any case, a VMRUN instruction would not even start to execute during an attempted event delivery. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 61738ff8ef33..5e13357da21e 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -659,11 +659,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) goto out; } - - /* Clear internal status */ - kvm_clear_exception_queue(vcpu); - kvm_clear_interrupt_queue(vcpu); - /* * Since vmcb01 is not in use, we can use it to store some of the L1 * state. -- cgit v1.2.3 From 269e9552d208179bc14ea7f80a9e3e8ae97795a2 Mon Sep 17 00:00:00 2001 From: Hamza Mahfooz Date: Mon, 12 Jul 2021 22:33:38 -0400 Subject: KVM: const-ify all relevant uses of struct kvm_memory_slot As alluded to in commit f36f3f2846b5 ("KVM: add "new" argument to kvm_arch_commit_memory_region"), a bunch of other places where struct kvm_memory_slot is used, needs to be refactored to preserve the "const"ness of struct kvm_memory_slot across-the-board. Signed-off-by: Hamza Mahfooz Message-Id: <20210713023338.57108-1-someguy@effective-light.com> [Do not touch body of slot_rmap_walk_init. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 ++-- arch/x86/kvm/mmu/mmu.c | 42 ++++++++++++++++++++--------------------- arch/x86/kvm/mmu/mmu_internal.h | 4 ++-- arch/x86/kvm/mmu/tdp_mmu.c | 7 ++++--- arch/x86/kvm/mmu/tdp_mmu.h | 6 +++--- arch/x86/kvm/x86.c | 7 ++----- 6 files changed, 34 insertions(+), 36 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ec8e4aca69c8..99f37781a6fc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1537,12 +1537,12 @@ void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot, + const struct kvm_memory_slot *memslot, int start_level); void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, - struct kvm_memory_slot *memslot); + const struct kvm_memory_slot *memslot); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 29010abb659c..e702361b4409 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -794,7 +794,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } -static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, +static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, gfn_t gfn, int count) { struct kvm_lpage_info *linfo; @@ -807,12 +807,12 @@ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, } } -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, 1); } -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, -1); } @@ -999,7 +999,7 @@ static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep) } static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { unsigned long idx; @@ -1228,7 +1228,7 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep) * Returns true iff any D or W bits were cleared. */ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -1387,7 +1387,7 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) } static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -1452,7 +1452,7 @@ restart: struct slot_rmap_walk_iterator { /* input fields. */ - struct kvm_memory_slot *slot; + const struct kvm_memory_slot *slot; gfn_t start_gfn; gfn_t end_gfn; int start_level; @@ -1479,7 +1479,7 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, - struct kvm_memory_slot *slot, int start_level, + const struct kvm_memory_slot *slot, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn) { iterator->slot = slot; @@ -5313,12 +5313,13 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, EXPORT_SYMBOL_GPL(kvm_configure_mmu); /* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot); +typedef bool (*slot_level_handler) (struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot); /* The caller should hold mmu-lock before calling this function. */ static __always_inline bool -slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield, bool flush) @@ -5345,7 +5346,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, } static __always_inline bool -slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, bool flush_on_yield) { @@ -5356,7 +5357,7 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, } static __always_inline bool -slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_leaf(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, bool flush_on_yield) { return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, @@ -5615,7 +5616,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (start >= end) continue; - flush = slot_handle_level_range(kvm, memslot, + flush = slot_handle_level_range(kvm, + (const struct kvm_memory_slot *) memslot, kvm_zap_rmapp, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true, flush); @@ -5643,13 +5645,13 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) static bool slot_rmap_write_protect(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { return __rmap_write_protect(kvm, rmap_head, false); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot, + const struct kvm_memory_slot *memslot, int start_level) { bool flush = false; @@ -5685,7 +5687,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -5724,10 +5726,8 @@ restart: } void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *slot) { - /* FIXME: const-ify all uses of struct kvm_memory_slot. */ - struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; bool flush = false; if (kvm_memslots_have_rmaps(kvm)) { @@ -5763,7 +5763,7 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, } void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, - struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *memslot) { bool flush = false; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 626cb848dab4..ca7b7595bbfc 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -124,8 +124,8 @@ static inline bool is_nx_huge_page_enabled(void) int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync); -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 41cee1d22918..43f12f5d12c0 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1246,8 +1246,8 @@ retry: * only affect leaf SPTEs down to min_level. * Returns true if an SPTE has been changed and the TLBs need to be flushed. */ -bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, - int min_level) +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot, int min_level) { struct kvm_mmu_page *root; bool spte_set = false; @@ -1317,7 +1317,8 @@ retry: * each SPTE. Returns true if an SPTE has been changed and the TLBs need to * be flushed. */ -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; bool spte_set = false; diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 361b47f98cc5..b224d126adf9 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -61,10 +61,10 @@ bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); -bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, - int min_level); +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot, int min_level); bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, - struct kvm_memory_slot *slot); + const struct kvm_memory_slot *slot); void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4d246b7f6ce1..348452bb16bc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11520,7 +11520,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) static void kvm_mmu_slot_apply_flags(struct kvm *kvm, struct kvm_memory_slot *old, - struct kvm_memory_slot *new, + const struct kvm_memory_slot *new, enum kvm_mr_change change) { bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES; @@ -11600,10 +11600,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_mmu_change_mmu_pages(kvm, kvm_mmu_calculate_default_mmu_pages(kvm)); - /* - * FIXME: const-ify all uses of struct kvm_memory_slot. - */ - kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change); + kvm_mmu_slot_apply_flags(kvm, old, new, change); /* Free the arrays associated with the old memslot. */ if (change == KVM_MR_MOVE) -- cgit v1.2.3 From dc1cff969101afd08601e90463b44bd572e62dd4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 30 Jul 2021 18:04:53 -0400 Subject: KVM: X86: MMU: Tune PTE_LIST_EXT to be bigger Currently rmap array element only contains 3 entries. However for EPT=N there could have a lot of guest pages that got tens of even hundreds of rmap entry. A normal distribution of a 6G guest (even if idle) shows this with rmap count statistics: Rmap_Count: 0 1 2-3 4-7 8-15 16-31 32-63 64-127 128-255 256-511 512-1023 Level=4K: 3089171 49005 14016 1363 235 212 15 7 0 0 0 Level=2M: 5951 227 0 0 0 0 0 0 0 0 0 Level=1G: 32 0 0 0 0 0 0 0 0 0 0 If we do some more fork some pages will grow even larger rmap counts. This patch makes PTE_LIST_EXT bigger so it'll be more efficient for the general use case of EPT=N as we do list reference less and the loops over PTE_LIST_EXT will be slightly more efficient; but still not too large so less waste when array not full. It should not affecting EPT=Y since EPT normally only has zero or one rmap entry for each page, so no array is even allocated. With a test case to fork 500 child and recycle them ("./rmap_fork 500" [1]), this patch speeds up fork time of about 29%. Before: 473.90 (+-5.93%) After: 366.10 (+-4.94%) [1] https://github.com/xzpeter/clibs/commit/825436f825453de2ea5aaee4bdb1c92281efe5b3 Signed-off-by: Peter Xu Message-Id: <20210730220455.26054-6-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e702361b4409..69ad8189d7eb 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -137,8 +137,8 @@ module_param(dbg, bool, 0644); #include -/* make pte_list_desc fit well in cache line */ -#define PTE_LIST_EXT 3 +/* make pte_list_desc fit well in cache lines */ +#define PTE_LIST_EXT 15 struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; -- cgit v1.2.3 From 13236e25ebab91b3e42ddedf5354b569ace1b555 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 30 Jul 2021 18:06:02 -0400 Subject: KVM: X86: Optimize pte_list_desc with per-array counter Add a counter field into pte_list_desc, so as to simplify the add/remove/loop logic. E.g., we don't need to loop over the array any more for most reasons. This will make more sense after we've switched the array size to be larger otherwise the counter will be a waste. Initially I wanted to store a tail pointer at the head of the array list so we don't need to traverse the list at least for pushing new ones (if without the counter we traverse both the list and the array). However that'll need slightly more change without a huge lot benefit, e.g., after we grow entry numbers per array the list traversing is not so expensive. So let's be simple but still try to get as much benefit as we can with just these extra few lines of changes (not to mention the code looks easier too without looping over arrays). I used the same a test case to fork 500 child and recycle them ("./rmap_fork 500" [1]), this patch further speeds up the total fork time of about 4%, which is a total of 33% of vanilla kernel: Vanilla: 473.90 (+-5.93%) 3->15 slots: 366.10 (+-4.94%) Add counter: 351.00 (+-3.70%) [1] https://github.com/xzpeter/clibs/commit/825436f825453de2ea5aaee4bdb1c92281efe5b3 Signed-off-by: Peter Xu Message-Id: <20210730220602.26327-1-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 69ad8189d7eb..f1eb2fdfa473 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -138,11 +138,21 @@ module_param(dbg, bool, 0644); #include /* make pte_list_desc fit well in cache lines */ -#define PTE_LIST_EXT 15 +#define PTE_LIST_EXT 14 +/* + * Slight optimization of cacheline layout, by putting `more' and `spte_count' + * at the start; then accessing it will only use one single cacheline for + * either full (entries==PTE_LIST_EXT) case or entries<=6. + */ struct pte_list_desc { - u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; + /* + * Stores number of entries stored in the pte_list_desc. No need to be + * u64 but just for easier alignment. When PTE_LIST_EXT, means full. + */ + u64 spte_count; + u64 *sptes[PTE_LIST_EXT]; }; struct kvm_shadow_walk_iterator { @@ -901,7 +911,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; - int i, count = 0; + int count = 0; if (!rmap_head->val) { rmap_printk("%p %llx 0->1\n", spte, *spte); @@ -911,24 +921,24 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, desc = mmu_alloc_pte_list_desc(vcpu); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; + desc->spte_count = 2; rmap_head->val = (unsigned long)desc | 1; ++count; } else { rmap_printk("%p %llx many->many\n", spte, *spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - while (desc->sptes[PTE_LIST_EXT-1]) { + while (desc->spte_count == PTE_LIST_EXT) { count += PTE_LIST_EXT; - if (!desc->more) { desc->more = mmu_alloc_pte_list_desc(vcpu); desc = desc->more; + desc->spte_count = 0; break; } desc = desc->more; } - for (i = 0; desc->sptes[i]; ++i) - ++count; - desc->sptes[i] = spte; + count += desc->spte_count; + desc->sptes[desc->spte_count++] = spte; } return count; } @@ -938,13 +948,12 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, struct pte_list_desc *desc, int i, struct pte_list_desc *prev_desc) { - int j; + int j = desc->spte_count - 1; - for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j) - ; desc->sptes[i] = desc->sptes[j]; desc->sptes[j] = NULL; - if (j != 0) + desc->spte_count--; + if (desc->spte_count) return; if (!prev_desc && !desc->more) rmap_head->val = 0; @@ -977,7 +986,7 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); prev_desc = NULL; while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { + for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { pte_list_desc_remove_entry(rmap_head, desc, i, prev_desc); -- cgit v1.2.3 From a75b540451d20ef1aebaa09d183ddc5c44c6f86a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 30 Jul 2021 18:06:05 -0400 Subject: KVM: X86: Optimize zapping rmap Using rmap_get_first() and rmap_remove() for zapping a huge rmap list could be slow. The easy way is to travers the rmap list, collecting the a/d bits and free the slots along the way. Provide a pte_list_destroy() and do exactly that. Signed-off-by: Peter Xu Message-Id: <20210730220605.26377-1-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f1eb2fdfa473..232ced2e7bf8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1007,6 +1007,34 @@ static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep) __pte_list_remove(sptep, rmap_head); } +/* Return true if rmap existed, false otherwise */ +static bool pte_list_destroy(struct kvm_rmap_head *rmap_head) +{ + struct pte_list_desc *desc, *next; + int i; + + if (!rmap_head->val) + return false; + + if (!(rmap_head->val & 1)) { + mmu_spte_clear_track_bits((u64 *)rmap_head->val); + goto out; + } + + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + + for (; desc; desc = next) { + for (i = 0; i < desc->spte_count; i++) + mmu_spte_clear_track_bits(desc->sptes[i]); + next = desc->more; + mmu_free_pte_list_desc(desc); + } +out: + /* rmap_head is meaningless now, remember to reset it */ + rmap_head->val = 0; + return true; +} + static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, const struct kvm_memory_slot *slot) { @@ -1398,18 +1426,7 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { - u64 *sptep; - struct rmap_iterator iter; - bool flush = false; - - while ((sptep = rmap_get_first(rmap_head, &iter))) { - rmap_printk("spte %p %llx.\n", sptep, *sptep); - - pte_list_remove(rmap_head, sptep); - flush = true; - } - - return flush; + return pte_list_destroy(rmap_head); } static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, -- cgit v1.2.3 From e79f49c37ccf273c8aba733f803b3774ebfbe581 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 28 Jul 2021 20:07:05 +0800 Subject: KVM: x86/pmu: Introduce pmc->is_paused to reduce the call time of perf interfaces Based on our observations, after any vm-exit associated with vPMU, there are at least two or more perf interfaces to be called for guest counter emulation, such as perf_event_{pause, read_value, period}(), and each one will {lock, unlock} the same perf_event_ctx. The frequency of calls becomes more severe when guest use counters in a multiplexed manner. Holding a lock once and completing the KVM request operations in the perf context would introduce a set of impractical new interfaces. So we can further optimize the vPMU implementation by avoiding repeated calls to these interfaces in the KVM context for at least one pattern: After we call perf_event_pause() once, the event will be disabled and its internal count will be reset to 0. So there is no need to pause it again or read its value. Once the event is paused, event period will not be updated until the next time it's resumed or reprogrammed. And there is also no need to call perf_event_period twice for a non-running counter, considering the perf_event for a running counter is never paused. Based on this implementation, for the following common usage of sampling 4 events using perf on a 4u8g guest: echo 0 > /proc/sys/kernel/watchdog echo 25 > /proc/sys/kernel/perf_cpu_time_max_percent echo 10000 > /proc/sys/kernel/perf_event_max_sample_rate echo 0 > /proc/sys/kernel/perf_cpu_time_max_percent for i in `seq 1 1 10` do taskset -c 0 perf record \ -e cpu-cycles -e instructions -e branch-instructions -e cache-misses \ /root/br_instr a done the average latency of the guest NMI handler is reduced from 37646.7 ns to 32929.3 ns (~1.14x speed up) on the Intel ICX server. Also, in addition to collecting more samples, no loss of sampling accuracy was observed compared to before the optimization. Signed-off-by: Like Xu Message-Id: <20210728120705.6855-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini Acked-by: Peter Zijlstra --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/pmu.c | 5 ++++- arch/x86/kvm/pmu.h | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 4 ++-- 4 files changed, 8 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 99f37781a6fc..a079880d4cd5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -482,6 +482,7 @@ struct kvm_pmc { * ctrl value for fixed counters. */ u64 current_config; + bool is_paused; }; struct kvm_pmu { diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 827886c12c16..0772bad9165c 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -137,18 +137,20 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, pmc->perf_event = event; pmc_to_pmu(pmc)->event_count++; clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); + pmc->is_paused = false; } static void pmc_pause_counter(struct kvm_pmc *pmc) { u64 counter = pmc->counter; - if (!pmc->perf_event) + if (!pmc->perf_event || pmc->is_paused) return; /* update counter, reset event value to avoid redundant accumulation */ counter += perf_event_pause(pmc->perf_event, true); pmc->counter = counter & pmc_bitmask(pmc); + pmc->is_paused = true; } static bool pmc_resume_counter(struct kvm_pmc *pmc) @@ -163,6 +165,7 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) /* reuse perf_event to serve as pmc_reprogram_counter() does*/ perf_event_enable(pmc->perf_event); + pmc->is_paused = false; clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); return true; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 67e753edfa22..0e4f2b1fa9fb 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -55,7 +55,7 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) u64 counter, enabled, running; counter = pmc->counter; - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) counter += perf_event_read_value(pmc->perf_event, &enabled, &running); /* FIXME: Scaling needed? */ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 9efc1a6b8693..10cc4f65c4ef 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -437,13 +437,13 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) perf_event_period(pmc->perf_event, get_sample_period(pmc, data)); return 0; } else if ((pmc = get_fixed_pmc(pmu, msr))) { pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) perf_event_period(pmc->perf_event, get_sample_period(pmc, data)); return 0; -- cgit v1.2.3 From eb48d154cd0dade56a0e244f0cfa198ea2925ed3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Aug 2021 13:38:29 +0100 Subject: arm64: Move .hyp.rodata outside of the _sdata.._edata range The HYP rodata section is currently lumped together with the BSS, which isn't exactly what is expected (it gets registered with kmemleak, for example). Move it away so that it is actually marked RO. As an added benefit, it isn't registered with kmemleak anymore. Fixes: 380e18ade4a5 ("KVM: arm64: Introduce a BSS section for use at Hyp") Suggested-by: Catalin Marinas Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org #5.13 Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20210802123830.2195174-2-maz@kernel.org --- arch/arm64/kernel/vmlinux.lds.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 709d2c433c5e..f6b1a88245db 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -181,6 +181,8 @@ SECTIONS /* everything from this point to __init_begin will be marked RO NX */ RO_DATA(PAGE_SIZE) + HYPERVISOR_DATA_SECTIONS + idmap_pg_dir = .; . += IDMAP_DIR_SIZE; idmap_pg_end = .; @@ -260,8 +262,6 @@ SECTIONS _sdata = .; RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) - HYPERVISOR_DATA_SECTIONS - /* * Data written with the MMU off but read with the MMU on requires * cache lines to be invalidated, discarding up to a Cache Writeback -- cgit v1.2.3 From 47e6223c841e029bfc23c3ce594dac5525cebaf8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Aug 2021 13:38:30 +0100 Subject: KVM: arm64: Unregister HYP sections from kmemleak in protected mode Booting a KVM host in protected mode with kmemleak quickly results in a pretty bad crash, as kmemleak doesn't know that the HYP sections have been taken away. This is specially true for the BSS section, which is part of the kernel BSS section and registered at boot time by kmemleak itself. Unregister the HYP part of the BSS before making that section HYP-private. The rest of the HYP-specific data is obtained via the page allocator or lives in other sections, none of which is subjected to kmemleak. Fixes: 90134ac9cabb ("KVM: arm64: Protect the .hyp sections from the host") Reviewed-by: Quentin Perret Reviewed-by: Catalin Marinas Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org # 5.13 Link: https://lore.kernel.org/r/20210802123830.2195174-3-maz@kernel.org --- arch/arm64/kvm/arm.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e9a2b8f27792..52242f32c4be 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -1982,6 +1983,12 @@ static int finalize_hyp_mode(void) if (ret) return ret; + /* + * Exclude HYP BSS from kmemleak so that it doesn't get peeked + * at, which would end badly once the section is inaccessible. + * None of other sections should ever be introspected. + */ + kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); ret = pkvm_mark_hyp_section(__hyp_bss); if (ret) return ret; -- cgit v1.2.3 From 319afe68567b923e25140e744e7f05e3e5d889c1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 4 Aug 2021 12:48:41 -0400 Subject: KVM: xen: do not use struct gfn_to_hva_cache gfn_to_hva_cache is not thread-safe, so it is usually used only within a vCPU (whose code is protected by vcpu->mutex). The Xen interface implementation has such a cache in kvm->arch, but it is not really used except to store the location of the shared info page. Replace shinfo_set and shinfo_cache with just the value that is passed via KVM_XEN_ATTR_TYPE_SHARED_INFO; the only complication is that the initialization value is not zero anymore and therefore kvm_xen_init_vm needs to be introduced. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/x86.c | 1 + arch/x86/kvm/xen.c | 23 ++++++++++++----------- arch/x86/kvm/xen.h | 5 +++++ 4 files changed, 19 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a079880d4cd5..6a73ff7db5f9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1003,9 +1003,8 @@ struct msr_bitmap_range { /* Xen emulation context */ struct kvm_xen { bool long_mode; - bool shinfo_set; u8 upcall_vector; - struct gfn_to_hva_cache shinfo_cache; + gfn_t shinfo_gfn; }; enum kvm_irqchip_mode { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 348452bb16bc..3cedc7cc132a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11162,6 +11162,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_hv_init_vm(kvm); kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); + kvm_xen_init_vm(kvm); return static_call(kvm_x86_vm_init)(kvm); } diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index ae17250e1efe..9ea9c3dabe37 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -25,15 +25,14 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) { gpa_t gpa = gfn_to_gpa(gfn); int wc_ofs, sec_hi_ofs; - int ret; + int ret = 0; int idx = srcu_read_lock(&kvm->srcu); - ret = kvm_gfn_to_hva_cache_init(kvm, &kvm->arch.xen.shinfo_cache, - gpa, PAGE_SIZE); - if (ret) + if (kvm_is_error_hva(gfn_to_hva(kvm, gfn))) { + ret = -EFAULT; goto out; - - kvm->arch.xen.shinfo_set = true; + } + kvm->arch.xen.shinfo_gfn = gfn; /* Paranoia checks on the 32-bit struct layout */ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); @@ -245,7 +244,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) case KVM_XEN_ATTR_TYPE_SHARED_INFO: if (data->u.shared_info.gfn == GPA_INVALID) { - kvm->arch.xen.shinfo_set = false; + kvm->arch.xen.shinfo_gfn = GPA_INVALID; r = 0; break; } @@ -283,10 +282,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - if (kvm->arch.xen.shinfo_set) - data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); - else - data->u.shared_info.gfn = GPA_INVALID; + data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_gfn); r = 0; break; @@ -646,6 +642,11 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) return 0; } +void kvm_xen_init_vm(struct kvm *kvm) +{ + kvm->arch.xen.shinfo_gfn = GPA_INVALID; +} + void kvm_xen_destroy_vm(struct kvm *kvm) { if (kvm->arch.xen_hvm_config.msr) diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index 463a7844a8ca..cc0cf5f37450 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -21,6 +21,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data); int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc); +void kvm_xen_init_vm(struct kvm *kvm); void kvm_xen_destroy_vm(struct kvm *kvm); static inline bool kvm_xen_msr_enabled(struct kvm *kvm) @@ -50,6 +51,10 @@ static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) return 1; } +static inline void kvm_xen_init_vm(struct kvm *kvm) +{ +} + static inline void kvm_xen_destroy_vm(struct kvm *kvm) { } -- cgit v1.2.3 From 87689270b10fa9e6fac7242233b355cb6792b845 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 4 Aug 2021 22:28:38 +0000 Subject: KVM: Rename lru_slot to last_used_slot lru_slot is used to keep track of the index of the most-recently used memslot. The correct acronym would be "mru" but that is not a common acronym. So call it last_used_slot which is a bit more obvious. Suggested-by: Paolo Bonzini Signed-off-by: David Matlack Message-Id: <20210804222844.1419481-2-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 4 ++-- include/linux/kvm_host.h | 6 +++--- virt/kvm/kvm_main.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4527ac7b5961..02574d7b3612 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1953,7 +1953,7 @@ out: static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn) { int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->lru_slot); + int slot = atomic_read(&slots->last_used_slot); struct kvm_memory_slot *memslots = slots->memslots; if (gfn >= memslots[slot].base_gfn && @@ -1974,7 +1974,7 @@ static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn) if (gfn >= memslots[start].base_gfn && gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->lru_slot, start); + atomic_set(&slots->last_used_slot, start); } return start; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5b6a69caccb5..bdfd5ed539c9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -522,7 +522,7 @@ struct kvm_memslots { u64 generation; /* The mapping table from slot id to the index in memslots[]. */ short id_to_index[KVM_MEM_SLOTS_NUM]; - atomic_t lru_slot; + atomic_t last_used_slot; int used_slots; struct kvm_memory_slot memslots[]; }; @@ -1200,7 +1200,7 @@ static inline struct kvm_memory_slot * search_memslots(struct kvm_memslots *slots, gfn_t gfn) { int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->lru_slot); + int slot = atomic_read(&slots->last_used_slot); struct kvm_memory_slot *memslots = slots->memslots; if (unlikely(!slots->used_slots)) @@ -1221,7 +1221,7 @@ search_memslots(struct kvm_memslots *slots, gfn_t gfn) if (start < slots->used_slots && gfn >= memslots[start].base_gfn && gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->lru_slot, start); + atomic_set(&slots->last_used_slot, start); return &memslots[start]; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 930aeb8d3c3e..1984c7389787 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1224,8 +1224,8 @@ static inline void kvm_memslot_delete(struct kvm_memslots *slots, slots->used_slots--; - if (atomic_read(&slots->lru_slot) >= slots->used_slots) - atomic_set(&slots->lru_slot, 0); + if (atomic_read(&slots->last_used_slot) >= slots->used_slots) + atomic_set(&slots->last_used_slot, 0); for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) { mslots[i] = mslots[i + 1]; -- cgit v1.2.3 From 0f22af940dc8ec4f437189096a5f8677995323b0 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 4 Aug 2021 22:28:39 +0000 Subject: KVM: Move last_used_slot logic out of search_memslots Make search_memslots unconditionally search all memslots and move the last_used_slot logic up one level to __gfn_to_memslot. This is in preparation for introducing a per-vCPU last_used_slot. As part of this change convert existing callers of search_memslots to __gfn_to_memslot to avoid making any functional changes. Signed-off-by: David Matlack Message-Id: <20210804222844.1419481-3-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_64_vio.c | 2 +- arch/powerpc/kvm/book3s_64_vio_hv.c | 2 +- include/linux/kvm_host.h | 64 +++++++++++++++++++++++++++---------- 3 files changed, 50 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 8da93fdfa59e..6365087f3160 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -346,7 +346,7 @@ static long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce, unsigned long gfn = tce >> PAGE_SHIFT; struct kvm_memory_slot *memslot; - memslot = search_memslots(kvm_memslots(kvm), gfn); + memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); if (!memslot) return -EINVAL; diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index dc6591548f0c..f38dfe195ef2 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -80,7 +80,7 @@ static long kvmppc_rm_tce_to_ua(struct kvm *kvm, unsigned long gfn = tce >> PAGE_SHIFT; struct kvm_memory_slot *memslot; - memslot = search_memslots(kvm_memslots_raw(kvm), gfn); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); if (!memslot) return -EINVAL; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index bdfd5ed539c9..f30b53a07917 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1189,29 +1189,43 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); /* - * search_memslots() and __gfn_to_memslot() are here because they are - * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. - * gfn_to_memslot() itself isn't here as an inline because that would - * bloat other code too much. + * Returns a pointer to the memslot at slot_index if it contains gfn. + * Otherwise returns NULL. + */ +static inline struct kvm_memory_slot * +try_get_memslot(struct kvm_memslots *slots, int slot_index, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + if (slot_index < 0 || slot_index >= slots->used_slots) + return NULL; + + slot = &slots->memslots[slot_index]; + + if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages) + return slot; + else + return NULL; +} + +/* + * Returns a pointer to the memslot that contains gfn and records the index of + * the slot in index. Otherwise returns NULL. * * IMPORTANT: Slots are sorted from highest GFN to lowest GFN! */ static inline struct kvm_memory_slot * -search_memslots(struct kvm_memslots *slots, gfn_t gfn) +search_memslots(struct kvm_memslots *slots, gfn_t gfn, int *index) { int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->last_used_slot); struct kvm_memory_slot *memslots = slots->memslots; + struct kvm_memory_slot *slot; if (unlikely(!slots->used_slots)) return NULL; - if (gfn >= memslots[slot].base_gfn && - gfn < memslots[slot].base_gfn + memslots[slot].npages) - return &memslots[slot]; - while (start < end) { - slot = start + (end - start) / 2; + int slot = start + (end - start) / 2; if (gfn >= memslots[slot].base_gfn) end = slot; @@ -1219,19 +1233,37 @@ search_memslots(struct kvm_memslots *slots, gfn_t gfn) start = slot + 1; } - if (start < slots->used_slots && gfn >= memslots[start].base_gfn && - gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->last_used_slot, start); - return &memslots[start]; + slot = try_get_memslot(slots, start, gfn); + if (slot) { + *index = start; + return slot; } return NULL; } +/* + * __gfn_to_memslot() and its descendants are here because it is called from + * non-modular code in arch/powerpc/kvm/book3s_64_vio{,_hv}.c. gfn_to_memslot() + * itself isn't here as an inline because that would bloat other code too much. + */ static inline struct kvm_memory_slot * __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) { - return search_memslots(slots, gfn); + struct kvm_memory_slot *slot; + int slot_index = atomic_read(&slots->last_used_slot); + + slot = try_get_memslot(slots, slot_index, gfn); + if (slot) + return slot; + + slot = search_memslots(slots, gfn, &slot_index); + if (slot) { + atomic_set(&slots->last_used_slot, slot_index); + return slot; + } + + return NULL; } static inline unsigned long -- cgit v1.2.3 From 081de470f1e6e83f9f460ba5ae8f57ff07f37692 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 4 Aug 2021 22:28:41 +0000 Subject: KVM: x86/mmu: Leverage vcpu->last_used_slot in tdp_mmu_map_handle_target_level The existing TDP MMU methods to handle dirty logging are vcpu-agnostic since they can be driven by MMU notifiers and other non-vcpu-specific events in addition to page faults. However this means that the TDP MMU is not benefiting from the new vcpu->last_used_slot. Fix that by introducing a tdp_mmu_map_set_spte_atomic() which is only called during a TDP page fault and has access to the kvm_vcpu for fast slot lookups. This improves "Populate memory time" in dirty_log_perf_test by 5%: Command | Before | After ------------------------------- | ---------------- | ------------- ./dirty_log_perf_test -v64 -x64 | 5.472321072s | 5.169832886s Signed-off-by: David Matlack Message-Id: <20210804222844.1419481-5-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 42 +++++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 43f12f5d12c0..dab6cb46cdb2 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -542,15 +542,40 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, return true; } -static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +/* + * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a + * TDP page fault. + * + * @vcpu: The vcpu instance that took the TDP page fault. + * @iter: a tdp_iter instance currently on the SPTE that should be set + * @new_spte: The value the SPTE should be set to + * + * Returns: true if the SPTE was set, false if it was not. If false is returned, + * this function will have no side-effects. + */ +static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu, + struct tdp_iter *iter, + u64 new_spte) { + struct kvm *kvm = vcpu->kvm; + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte)) return false; - handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, - iter->old_spte, new_spte, iter->level); + /* + * Use kvm_vcpu_gfn_to_memslot() instead of going through + * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot. + */ + if (is_writable_pte(new_spte)) { + struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn); + + if (slot && kvm_slot_dirty_track_enabled(slot)) { + /* Enforced by kvm_mmu_hugepage_adjust. */ + WARN_ON_ONCE(iter->level > PG_LEVEL_4K); + mark_page_dirty_in_slot(kvm, slot, iter->gfn); + } + } + return true; } @@ -563,7 +588,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, * immediately installing a present entry in its place * before the TLBs are flushed. */ - if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE)) return false; kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, @@ -931,7 +956,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; - else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) + else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte)) return RET_PF_RETRY; /* @@ -1035,8 +1060,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, new_spte = make_nonleaf_spte(child_pt, !shadow_accessed_mask); - if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, - new_spte)) { + if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) { tdp_mmu_link_page(vcpu->kvm, sp, true, huge_page_disallowed && req_level >= iter.level); -- cgit v1.2.3 From 601f8af01e5ae535b45cdb91234887c9fd861ad4 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 4 Aug 2021 22:28:42 +0000 Subject: KVM: x86/mmu: Leverage vcpu->last_used_slot for rmap_add and rmap_recycle rmap_add() and rmap_recycle() both run in the context of the vCPU and thus we can use kvm_vcpu_gfn_to_memslot() to look up the memslot. This enables rmap_add() and rmap_recycle() to take advantage of vcpu->last_used_slot and avoid expensive memslot searching. This change improves the performance of "Populate memory time" in dirty_log_perf_test with tdp_mmu=N. In addition to improving the performance, "Populate memory time" no longer scales with the number of memslots in the VM. Command | Before | After ------------------------------- | ---------------- | ------------- ./dirty_log_perf_test -v64 -x1 | 15.18001570s | 14.99469366s ./dirty_log_perf_test -v64 -x64 | 18.71336392s | 14.98675076s Reviewed-by: Paolo Bonzini Signed-off-by: David Matlack Message-Id: <20210804222844.1419481-6-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 232ced2e7bf8..3a0ae48a26e9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1044,17 +1044,6 @@ static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; } -static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, - struct kvm_mmu_page *sp) -{ - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; - - slots = kvm_memslots_for_spte_role(kvm, sp->role); - slot = __gfn_to_memslot(slots, gfn); - return __gfn_to_rmap(gfn, sp->role.level, slot); -} - static bool rmap_can_add(struct kvm_vcpu *vcpu) { struct kvm_mmu_memory_cache *mc; @@ -1065,24 +1054,39 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu) static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { + struct kvm_memory_slot *slot; struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + rmap_head = __gfn_to_rmap(gfn, sp->role.level, slot); return pte_list_add(vcpu, spte, rmap_head); } + static void rmap_remove(struct kvm *kvm, u64 *spte) { + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; struct kvm_mmu_page *sp; gfn_t gfn; struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); - rmap_head = gfn_to_rmap(kvm, gfn, sp); + + /* + * Unlike rmap_add and rmap_recycle, rmap_remove does not run in the + * context of a vCPU so have to determine which memslots to use based + * on context information in sp->role. + */ + slots = kvm_memslots_for_spte_role(kvm, sp->role); + + slot = __gfn_to_memslot(slots, gfn); + rmap_head = __gfn_to_rmap(gfn, sp->role.level, slot); + __pte_list_remove(spte, rmap_head); } @@ -1620,12 +1624,13 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { + struct kvm_memory_slot *slot; struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; sp = sptep_to_sp(spte); - - rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + rmap_head = __gfn_to_rmap(gfn, sp->role.level, slot); kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, -- cgit v1.2.3 From 93e083d4f4bfe790eb1cdc87103bd6a84be9df75 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 4 Aug 2021 22:28:43 +0000 Subject: KVM: x86/mmu: Rename __gfn_to_rmap to gfn_to_rmap gfn_to_rmap was removed in the previous patch so there is no need to retain the double underscore on __gfn_to_rmap. Reviewed-by: Paolo Bonzini Signed-off-by: David Matlack Message-Id: <20210804222844.1419481-7-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 25 ++++++++++++------------- arch/x86/kvm/mmu/mmu_audit.c | 4 ++-- 2 files changed, 14 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3a0ae48a26e9..964c797dcc46 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1035,8 +1035,8 @@ out: return true; } -static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, - const struct kvm_memory_slot *slot) +static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, + const struct kvm_memory_slot *slot) { unsigned long idx; @@ -1061,7 +1061,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = sptep_to_sp(spte); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - rmap_head = __gfn_to_rmap(gfn, sp->role.level, slot); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); return pte_list_add(vcpu, spte, rmap_head); } @@ -1085,7 +1085,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); - rmap_head = __gfn_to_rmap(gfn, sp->role.level, slot); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); __pte_list_remove(spte, rmap_head); } @@ -1307,8 +1307,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, return; while (mask) { - rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PG_LEVEL_4K, slot); __rmap_write_protect(kvm, rmap_head, false); /* clear the first set bit */ @@ -1340,8 +1340,8 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, return; while (mask) { - rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PG_LEVEL_4K, slot); __rmap_clear_dirty(kvm, rmap_head, slot); /* clear the first set bit */ @@ -1407,7 +1407,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, if (kvm_memslots_have_rmaps(kvm)) { for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { - rmap_head = __gfn_to_rmap(gfn, i, slot); + rmap_head = gfn_to_rmap(gfn, i, slot); write_protected |= __rmap_write_protect(kvm, rmap_head, true); } } @@ -1502,9 +1502,8 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) { iterator->level = level; iterator->gfn = iterator->start_gfn; - iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot); - iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level, - iterator->slot); + iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot); + iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot); } static void @@ -1630,7 +1629,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = sptep_to_sp(spte); slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - rmap_head = __gfn_to_rmap(gfn, sp->role.level, slot); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c index cedc17b2f60e..9e7dcf999f08 100644 --- a/arch/x86/kvm/mmu/mmu_audit.c +++ b/arch/x86/kvm/mmu/mmu_audit.c @@ -147,7 +147,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot); + rmap_head = gfn_to_rmap(gfn, rev_sp->role.level, slot); if (!rmap_head->val) { if (!__ratelimit(&ratelimit_state)) return; @@ -200,7 +200,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, sp->gfn); - rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); for_each_rmap_spte(rmap_head, &iter, sptep) { if (is_writable_pte(*sptep)) -- cgit v1.2.3 From d21292f13f1f0721d60e8122e2db46bea8cf6950 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 9 Aug 2021 16:24:28 +0100 Subject: KVM: arm64: Add hyp_spin_is_locked() for basic locking assertions at EL2 Introduce hyp_spin_is_locked() so that functions can easily assert that a given lock is held (albeit possibly by another CPU!) without having to drag full lockdep support up to EL2. Signed-off-by: Will Deacon Signed-off-by: Quentin Perret Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-2-qperret@google.com --- arch/arm64/kvm/hyp/include/nvhe/spinlock.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h index 76b537f8d1c6..04f65b655fcf 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h +++ b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h @@ -15,6 +15,7 @@ #include #include +#include typedef union hyp_spinlock { u32 __val; @@ -89,4 +90,11 @@ static inline void hyp_spin_unlock(hyp_spinlock_t *lock) : "memory"); } +static inline bool hyp_spin_is_locked(hyp_spinlock_t *lock) +{ + hyp_spinlock_t lockval = READ_ONCE(*lock); + + return lockval.owner != lockval.next; +} + #endif /* __ARM64_KVM_NVHE_SPINLOCK_H__ */ -- cgit v1.2.3 From 8e049e0daf23aa380c264e5e15e4c64ea5497ed7 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:29 +0100 Subject: KVM: arm64: Introduce hyp_assert_lock_held() Introduce a poor man's lockdep implementation at EL2 which allows to BUG() whenever a hyp spinlock is not held when it should. Hide this feature behind a new Kconfig option that targets the EL2 object specifically, instead of piggy backing on the existing CONFIG_LOCKDEP. EL2 cannot WARN() cleanly to report locking issues, hence BUG() is the only option and it is not clear whether we want this widely enabled. This is most likely going to be useful for local testing until the EL2 WARN() situation has improved. Signed-off-by: Quentin Perret Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-3-qperret@google.com --- arch/arm64/kvm/Kconfig | 9 +++++++++ arch/arm64/kvm/hyp/include/nvhe/spinlock.h | 17 +++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index a4eba0908bfa..9b9721895e5c 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -46,6 +46,15 @@ if KVM source "virt/kvm/Kconfig" +config NVHE_EL2_DEBUG + bool "Debug mode for non-VHE EL2 object" + help + Say Y here to enable the debug mode for the non-VHE KVM EL2 object. + Failure reports will BUG() in the hypervisor. This is intended for + local EL2 hypervisor development. + + If unsure, say N. + endif # KVM endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h index 04f65b655fcf..4652fd04bdbe 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h +++ b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h @@ -97,4 +97,21 @@ static inline bool hyp_spin_is_locked(hyp_spinlock_t *lock) return lockval.owner != lockval.next; } +#ifdef CONFIG_NVHE_EL2_DEBUG +static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) +{ + /* + * The __pkvm_init() path accesses protected data-structures without + * holding locks as the other CPUs are guaranteed to not enter EL2 + * concurrently at this point in time. The point by which EL2 is + * initialized on all CPUs is reflected in the pkvm static key, so + * wait until it is set before checking the lock state. + */ + if (static_branch_likely(&kvm_protected_mode_initialized)) + BUG_ON(!hyp_spin_is_locked(lock)); +} +#else +static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) { } +#endif + #endif /* __ARM64_KVM_NVHE_SPINLOCK_H__ */ -- cgit v1.2.3 From 1bac49d490cbc813f407a5c9806e464bf4a300c9 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:30 +0100 Subject: KVM: arm64: Provide the host_stage2_try() helper macro We currently unmap all MMIO mappings from the host stage-2 to recycle the pages whenever we run out. In order to make this pattern easy to re-use from other places, factor the logic out into a dedicated macro. While at it, apply the macro for the kvm_pgtable_stage2_set_owner() calls. They're currently only called early on and are guaranteed to succeed, but making them robust to the -ENOMEM case doesn't hurt and will avoid painful debugging sessions later on. Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-4-qperret@google.com --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 40 +++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index d938ce95d3bd..74280a753efb 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -208,6 +208,25 @@ static inline int __host_stage2_idmap(u64 start, u64 end, prot, &host_s2_pool); } +/* + * The pool has been provided with enough pages to cover all of memory with + * page granularity, but it is difficult to know how much of the MMIO range + * we will need to cover upfront, so we may need to 'recycle' the pages if we + * run out. + */ +#define host_stage2_try(fn, ...) \ + ({ \ + int __ret; \ + hyp_assert_lock_held(&host_kvm.lock); \ + __ret = fn(__VA_ARGS__); \ + if (__ret == -ENOMEM) { \ + __ret = host_stage2_unmap_dev_all(); \ + if (!__ret) \ + __ret = fn(__VA_ARGS__); \ + } \ + __ret; \ + }) + static int host_stage2_idmap(u64 addr) { enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W; @@ -223,22 +242,7 @@ static int host_stage2_idmap(u64 addr) if (ret) goto unlock; - ret = __host_stage2_idmap(range.start, range.end, prot); - if (ret != -ENOMEM) - goto unlock; - - /* - * The pool has been provided with enough pages to cover all of memory - * with page granularity, but it is difficult to know how much of the - * MMIO range we will need to cover upfront, so we may need to 'recycle' - * the pages if we run out. - */ - ret = host_stage2_unmap_dev_all(); - if (ret) - goto unlock; - - ret = __host_stage2_idmap(range.start, range.end, prot); - + ret = host_stage2_try(__host_stage2_idmap, range.start, range.end, prot); unlock: hyp_spin_unlock(&host_kvm.lock); @@ -257,8 +261,8 @@ int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end) return -EINVAL; hyp_spin_lock(&host_kvm.lock); - ret = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start, - &host_s2_pool, pkvm_hyp_id); + ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt, + start, end - start, &host_s2_pool, pkvm_hyp_id); hyp_spin_unlock(&host_kvm.lock); return ret != -EAGAIN ? ret : 0; -- cgit v1.2.3 From 51add457733bbc4a442fc280d73d14bfe262e4a0 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:32 +0100 Subject: KVM: arm64: Expose page-table helpers The KVM pgtable API exposes the kvm_pgtable_walk() function to allow the definition of walkers outside of pgtable.c. However, it is not easy to implement any of those walkers without some of the low-level helpers. Move some of them to the header file to allow re-use from other places. Signed-off-by: Quentin Perret Reviewed-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-6-qperret@google.com --- arch/arm64/include/asm/kvm_pgtable.h | 40 ++++++++++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/pgtable.c | 39 ----------------------------------- 2 files changed, 40 insertions(+), 39 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index e42b55bd50a2..83c21d35be10 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -25,6 +25,46 @@ static inline u64 kvm_get_parange(u64 mmfr0) typedef u64 kvm_pte_t; +#define KVM_PTE_VALID BIT(0) + +#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) +#define KVM_PTE_ADDR_51_48 GENMASK(15, 12) + +static inline bool kvm_pte_valid(kvm_pte_t pte) +{ + return pte & KVM_PTE_VALID; +} + +static inline u64 kvm_pte_to_phys(kvm_pte_t pte) +{ + u64 pa = pte & KVM_PTE_ADDR_MASK; + + if (PAGE_SHIFT == 16) + pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; + + return pa; +} + +static inline u64 kvm_granule_shift(u32 level) +{ + /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ + return ARM64_HW_PGTABLE_LEVEL_SHIFT(level); +} + +static inline u64 kvm_granule_size(u32 level) +{ + return BIT(kvm_granule_shift(level)); +} + +static inline bool kvm_level_supports_block_mapping(u32 level) +{ + /* + * Reject invalid block mappings and don't bother with 4TB mappings for + * 52-bit PAs. + */ + return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1)); +} + /** * struct kvm_pgtable_mm_ops - Memory management callbacks. * @zalloc_page: Allocate a single zeroed memory page. diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 78f36bd5df6c..49d768b92997 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -11,16 +11,12 @@ #include #include -#define KVM_PTE_VALID BIT(0) #define KVM_PTE_TYPE BIT(1) #define KVM_PTE_TYPE_BLOCK 0 #define KVM_PTE_TYPE_PAGE 1 #define KVM_PTE_TYPE_TABLE 1 -#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) -#define KVM_PTE_ADDR_51_48 GENMASK(15, 12) - #define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) #define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) @@ -61,17 +57,6 @@ struct kvm_pgtable_walk_data { u64 end; }; -static u64 kvm_granule_shift(u32 level) -{ - /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ - return ARM64_HW_PGTABLE_LEVEL_SHIFT(level); -} - -static u64 kvm_granule_size(u32 level) -{ - return BIT(kvm_granule_shift(level)); -} - #define KVM_PHYS_INVALID (-1ULL) static bool kvm_phys_is_valid(u64 phys) @@ -79,15 +64,6 @@ static bool kvm_phys_is_valid(u64 phys) return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_PARANGE_MAX)); } -static bool kvm_level_supports_block_mapping(u32 level) -{ - /* - * Reject invalid block mappings and don't bother with 4TB mappings for - * 52-bit PAs. - */ - return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1)); -} - static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level) { u64 granule = kvm_granule_size(level); @@ -135,11 +111,6 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) return __kvm_pgd_page_idx(&pgt, -1ULL) + 1; } -static bool kvm_pte_valid(kvm_pte_t pte) -{ - return pte & KVM_PTE_VALID; -} - static bool kvm_pte_table(kvm_pte_t pte, u32 level) { if (level == KVM_PGTABLE_MAX_LEVELS - 1) @@ -151,16 +122,6 @@ static bool kvm_pte_table(kvm_pte_t pte, u32 level) return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; } -static u64 kvm_pte_to_phys(kvm_pte_t pte) -{ - u64 pa = pte & KVM_PTE_ADDR_MASK; - - if (PAGE_SHIFT == 16) - pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; - - return pa; -} - static kvm_pte_t kvm_phys_to_pte(u64 pa) { kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; -- cgit v1.2.3 From c4f0935e4d957bfcea25ad76860445660a60f3fd Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:33 +0100 Subject: KVM: arm64: Optimize host memory aborts The kvm_pgtable_stage2_find_range() function is used in the host memory abort path to try and look for the largest block mapping that can be used to map the faulting address. In order to do so, the function currently walks the stage-2 page-table and looks for existing incompatible mappings within the range of the largest possible block. If incompatible mappings are found, it tries the same procedure again, but using a smaller block range, and repeats until a matching range is found (potentially up to page granularity). While this approach has benefits (mostly in the fact that it proactively coalesces host stage-2 mappings), it can be slow if the ranges are fragmented, and it isn't optimized to deal with CPUs faulting on the same IPA as all of them will do all the work every time. To avoid these issues, remove kvm_pgtable_stage2_find_range(), and walk the page-table only once in the host_mem_abort() path to find the closest leaf to the input address. With this, use the corresponding range if it is invalid and not owned by another entity. If a valid leaf is found, return -EAGAIN similar to what is done in the kvm_pgtable_stage2_map() path to optimize concurrent faults. Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-7-qperret@google.com --- arch/arm64/include/asm/kvm_pgtable.h | 30 -------------- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 45 ++++++++++++++++++++- arch/arm64/kvm/hyp/pgtable.c | 74 ----------------------------------- 3 files changed, 44 insertions(+), 105 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 83c21d35be10..8d6c710c1996 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -154,16 +154,6 @@ enum kvm_pgtable_prot { #define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) #define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE) -/** - * struct kvm_mem_range - Range of Intermediate Physical Addresses - * @start: Start of the range. - * @end: End of the range. - */ -struct kvm_mem_range { - u64 start; - u64 end; -}; - /** * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk. * @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid @@ -491,24 +481,4 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, */ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, kvm_pte_t *ptep, u32 *level); - -/** - * kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical - * Addresses with compatible permission - * attributes. - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). - * @addr: Address that must be covered by the range. - * @prot: Protection attributes that the range must be compatible with. - * @range: Range structure used to limit the search space at call time and - * that will hold the result. - * - * The offset of @addr within a page is ignored. An IPA is compatible with @prot - * iff its corresponding stage-2 page-table entry has default ownership and, if - * valid, is mapped with protection attributes identical to @prot. - * - * Return: 0 on success, negative error code on failure. - */ -int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr, - enum kvm_pgtable_prot prot, - struct kvm_mem_range *range); #endif /* __ARM64_KVM_PGTABLE_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 74280a753efb..2148d3968aa5 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -159,6 +159,11 @@ static int host_stage2_unmap_dev_all(void) return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr); } +struct kvm_mem_range { + u64 start; + u64 end; +}; + static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) { int cur, left = 0, right = hyp_memblock_nr; @@ -227,6 +232,44 @@ static inline int __host_stage2_idmap(u64 start, u64 end, __ret; \ }) +static inline bool range_included(struct kvm_mem_range *child, + struct kvm_mem_range *parent) +{ + return parent->start <= child->start && child->end <= parent->end; +} + +static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) +{ + struct kvm_mem_range cur; + kvm_pte_t pte; + u32 level; + int ret; + + hyp_assert_lock_held(&host_kvm.lock); + ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level); + if (ret) + return ret; + + if (kvm_pte_valid(pte)) + return -EAGAIN; + + if (pte) + return -EPERM; + + do { + u64 granule = kvm_granule_size(level); + cur.start = ALIGN_DOWN(addr, granule); + cur.end = cur.start + granule; + level++; + } while ((level < KVM_PGTABLE_MAX_LEVELS) && + !(kvm_level_supports_block_mapping(level) && + range_included(&cur, range))); + + *range = cur; + + return 0; +} + static int host_stage2_idmap(u64 addr) { enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W; @@ -238,7 +281,7 @@ static int host_stage2_idmap(u64 addr) prot |= KVM_PGTABLE_PROT_X; hyp_spin_lock(&host_kvm.lock); - ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, prot, &range); + ret = host_stage2_adjust_range(addr, &range); if (ret) goto unlock; diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 49d768b92997..4dff2ad39ee4 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1102,77 +1102,3 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz); pgt->pgd = NULL; } - -#define KVM_PTE_LEAF_S2_COMPAT_MASK (KVM_PTE_LEAF_ATTR_S2_PERMS | \ - KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \ - KVM_PTE_LEAF_ATTR_S2_IGNORED) - -static int stage2_check_permission_walker(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) -{ - kvm_pte_t old_attr, pte = *ptep, *new_attr = arg; - - /* - * Compatible mappings are either invalid and owned by the page-table - * owner (whose id is 0), or valid with matching permission attributes. - */ - if (kvm_pte_valid(pte)) { - old_attr = pte & KVM_PTE_LEAF_S2_COMPAT_MASK; - if (old_attr != *new_attr) - return -EEXIST; - } else if (pte) { - return -EEXIST; - } - - return 0; -} - -int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr, - enum kvm_pgtable_prot prot, - struct kvm_mem_range *range) -{ - kvm_pte_t attr; - struct kvm_pgtable_walker check_perm_walker = { - .cb = stage2_check_permission_walker, - .flags = KVM_PGTABLE_WALK_LEAF, - .arg = &attr, - }; - u64 granule, start, end; - u32 level; - int ret; - - ret = stage2_set_prot_attr(pgt, prot, &attr); - if (ret) - return ret; - attr &= KVM_PTE_LEAF_S2_COMPAT_MASK; - - for (level = pgt->start_level; level < KVM_PGTABLE_MAX_LEVELS; level++) { - granule = kvm_granule_size(level); - start = ALIGN_DOWN(addr, granule); - end = start + granule; - - if (!kvm_level_supports_block_mapping(level)) - continue; - - if (start < range->start || range->end < end) - continue; - - /* - * Check the presence of existing mappings with incompatible - * permissions within the current block range, and try one level - * deeper if one is found. - */ - ret = kvm_pgtable_walk(pgt, start, granule, &check_perm_walker); - if (ret != -EEXIST) - break; - } - - if (!ret) { - range->start = start; - range->end = end; - } - - return ret; -} -- cgit v1.2.3 From 178cac08d588e7406a09351a992f57892d8d9cc9 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:34 +0100 Subject: KVM: arm64: Rename KVM_PTE_LEAF_ATTR_S2_IGNORED The ignored bits for both stage-1 and stage-2 page and block descriptors are in [55:58], so rename KVM_PTE_LEAF_ATTR_S2_IGNORED to make it applicable to both. And while at it, since these bits are more commonly known as 'software' bits, rename accordingly. Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-8-qperret@google.com --- arch/arm64/kvm/hyp/pgtable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 4dff2ad39ee4..59a394d82de3 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -36,6 +36,8 @@ #define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) +#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) + #define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) @@ -44,8 +46,6 @@ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ KVM_PTE_LEAF_ATTR_HI_S2_XN) -#define KVM_PTE_LEAF_ATTR_S2_IGNORED GENMASK(58, 55) - #define KVM_INVALID_PTE_OWNER_MASK GENMASK(63, 56) #define KVM_MAX_OWNER_ID 1 -- cgit v1.2.3 From 8a0282c68121e53ab17413283cfed408a47e1a2a Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:35 +0100 Subject: KVM: arm64: Don't overwrite software bits with owner id We will soon start annotating page-tables with new flags to track shared pages and such, and we will do so in valid mappings using software bits in the PTEs, as provided by the architecture. However, it is possible that we will need to use those flags to annotate invalid mappings as well in the future, similar to what we do to track page ownership in the host stage-2. In order to facilitate the annotation of invalid mappings with such flags, it would be preferable to re-use the same bits as for valid mappings (bits [58-55]), but these are currently used for ownership encoding. Since we have plenty of bits left to use in invalid mappings, move the ownership bits further down the PTE to avoid the conflict. Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-9-qperret@google.com --- arch/arm64/kvm/hyp/pgtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 59a394d82de3..1ee1168ac32d 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -46,7 +46,7 @@ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ KVM_PTE_LEAF_ATTR_HI_S2_XN) -#define KVM_INVALID_PTE_OWNER_MASK GENMASK(63, 56) +#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) #define KVM_MAX_OWNER_ID 1 struct kvm_pgtable_walk_data { -- cgit v1.2.3 From b53846c5f279cb5329b82f19a7d313f02cb9d21c Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:36 +0100 Subject: KVM: arm64: Tolerate re-creating hyp mappings to set software bits The current hypervisor stage-1 mapping code doesn't allow changing an existing valid mapping. Relax this condition by allowing changes that only target software bits, as that will soon be needed to annotate shared pages. Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-10-qperret@google.com --- arch/arm64/kvm/hyp/pgtable.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 1ee1168ac32d..2689fcb7901d 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -362,6 +362,21 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) return 0; } +static bool hyp_pte_needs_update(kvm_pte_t old, kvm_pte_t new) +{ + /* + * Tolerate KVM recreating the exact same mapping, or changing software + * bits if the existing mapping was valid. + */ + if (old == new) + return false; + + if (!kvm_pte_valid(old)) + return true; + + return !WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW); +} + static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct hyp_map_data *data) { @@ -371,9 +386,8 @@ static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, if (!kvm_block_mapping_supported(addr, end, phys, level)) return false; - /* Tolerate KVM recreating the exact same mapping */ new = kvm_init_valid_leaf_pte(phys, data->attr, level); - if (old != new && !WARN_ON(kvm_pte_valid(old))) + if (hyp_pte_needs_update(old, new)) smp_store_release(ptep, new); data->phys += granule; -- cgit v1.2.3 From 5651311941105ca077d3ab74dd4a92e646ecf7fb Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:37 +0100 Subject: KVM: arm64: Enable forcing page-level stage-2 mappings Much of the stage-2 manipulation logic relies on being able to destroy block mappings if e.g. installing a smaller mapping in the range. The rationale for this behaviour is that stage-2 mappings can always be re-created lazily. However, this gets more complicated when the stage-2 page-table is used to store metadata about the underlying pages. In such cases, destroying a block mapping may lead to losing part of the state, and confuse the user of those metadata (such as the hypervisor in nVHE protected mode). To avoid this, introduce a callback function in the pgtable struct which is called during all map operations to determine whether the mappings can use blocks, or should be forced to page granularity. This is used by the hypervisor when creating the host stage-2 to force page-level mappings when using non-default protection attributes. Signed-off-by: Quentin Perret Reviewed-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-11-qperret@google.com --- arch/arm64/include/asm/kvm_pgtable.h | 66 ++++++++++++++++++++++------------- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 34 +++++++++++++++--- arch/arm64/kvm/hyp/pgtable.c | 29 ++++++++++++--- 3 files changed, 94 insertions(+), 35 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 8d6c710c1996..ca0c039547b5 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -115,25 +115,6 @@ enum kvm_pgtable_stage2_flags { KVM_PGTABLE_S2_IDMAP = BIT(1), }; -/** - * struct kvm_pgtable - KVM page-table. - * @ia_bits: Maximum input address size, in bits. - * @start_level: Level at which the page-table walk starts. - * @pgd: Pointer to the first top-level entry of the page-table. - * @mm_ops: Memory management callbacks. - * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. - */ -struct kvm_pgtable { - u32 ia_bits; - u32 start_level; - kvm_pte_t *pgd; - struct kvm_pgtable_mm_ops *mm_ops; - - /* Stage-2 only */ - struct kvm_s2_mmu *mmu; - enum kvm_pgtable_stage2_flags flags; -}; - /** * enum kvm_pgtable_prot - Page-table permissions and attributes. * @KVM_PGTABLE_PROT_X: Execute permission. @@ -149,11 +130,43 @@ enum kvm_pgtable_prot { KVM_PGTABLE_PROT_DEVICE = BIT(3), }; -#define PAGE_HYP (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W) +#define KVM_PGTABLE_PROT_RW (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W) +#define KVM_PGTABLE_PROT_RWX (KVM_PGTABLE_PROT_RW | KVM_PGTABLE_PROT_X) + +#define PKVM_HOST_MEM_PROT KVM_PGTABLE_PROT_RWX +#define PKVM_HOST_MMIO_PROT KVM_PGTABLE_PROT_RW + +#define PAGE_HYP KVM_PGTABLE_PROT_RW #define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X) #define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) #define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE) +typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, + enum kvm_pgtable_prot prot); + +/** + * struct kvm_pgtable - KVM page-table. + * @ia_bits: Maximum input address size, in bits. + * @start_level: Level at which the page-table walk starts. + * @pgd: Pointer to the first top-level entry of the page-table. + * @mm_ops: Memory management callbacks. + * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. + * @flags: Stage-2 page-table flags. + * @force_pte_cb: Function that returns true if page level mappings must + * be used instead of block mappings. + */ +struct kvm_pgtable { + u32 ia_bits; + u32 start_level; + kvm_pte_t *pgd; + struct kvm_pgtable_mm_ops *mm_ops; + + /* Stage-2 only */ + struct kvm_s2_mmu *mmu; + enum kvm_pgtable_stage2_flags flags; + kvm_pgtable_force_pte_cb_t force_pte_cb; +}; + /** * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk. * @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid @@ -246,21 +259,24 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift); /** - * kvm_pgtable_stage2_init_flags() - Initialise a guest stage-2 page-table. + * __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table. * @pgt: Uninitialised page-table structure to initialise. * @arch: Arch-specific KVM structure representing the guest virtual * machine. * @mm_ops: Memory management callbacks. * @flags: Stage-2 configuration flags. + * @force_pte_cb: Function that returns true if page level mappings must + * be used instead of block mappings. * * Return: 0 on success, negative error code on failure. */ -int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch, - struct kvm_pgtable_mm_ops *mm_ops, - enum kvm_pgtable_stage2_flags flags); +int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch, + struct kvm_pgtable_mm_ops *mm_ops, + enum kvm_pgtable_stage2_flags flags, + kvm_pgtable_force_pte_cb_t force_pte_cb); #define kvm_pgtable_stage2_init(pgt, arch, mm_ops) \ - kvm_pgtable_stage2_init_flags(pgt, arch, mm_ops, 0) + __kvm_pgtable_stage2_init(pgt, arch, mm_ops, 0, NULL) /** * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table. diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 2148d3968aa5..6fed6772c673 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -89,6 +89,8 @@ static void prepare_host_vtcr(void) id_aa64mmfr1_el1_sys_val, phys_shift); } +static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot); + int kvm_host_prepare_stage2(void *pgt_pool_base) { struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; @@ -101,8 +103,9 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) if (ret) return ret; - ret = kvm_pgtable_stage2_init_flags(&host_kvm.pgt, &host_kvm.arch, - &host_kvm.mm_ops, KVM_HOST_S2_FLAGS); + ret = __kvm_pgtable_stage2_init(&host_kvm.pgt, &host_kvm.arch, + &host_kvm.mm_ops, KVM_HOST_S2_FLAGS, + host_stage2_force_pte_cb); if (ret) return ret; @@ -270,15 +273,36 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) return 0; } +static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) +{ + /* + * Block mappings must be used with care in the host stage-2 as a + * kvm_pgtable_stage2_map() operation targeting a page in the range of + * an existing block will delete the block under the assumption that + * mappings in the rest of the block range can always be rebuilt lazily. + * That assumption is correct for the host stage-2 with RWX mappings + * targeting memory or RW mappings targeting MMIO ranges (see + * host_stage2_idmap() below which implements some of the host memory + * abort logic). However, this is not safe for any other mappings where + * the host stage-2 page-table is in fact the only place where this + * state is stored. In all those cases, it is safer to use page-level + * mappings, hence avoiding to lose the state because of side-effects in + * kvm_pgtable_stage2_map(). + */ + if (range_is_memory(addr, end)) + return prot != PKVM_HOST_MEM_PROT; + else + return prot != PKVM_HOST_MMIO_PROT; +} + static int host_stage2_idmap(u64 addr) { - enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W; struct kvm_mem_range range; bool is_memory = find_mem_range(addr, &range); + enum kvm_pgtable_prot prot; int ret; - if (is_memory) - prot |= KVM_PGTABLE_PROT_X; + prot = is_memory ? PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT; hyp_spin_lock(&host_kvm.lock); ret = host_stage2_adjust_range(addr, &range); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 2689fcb7901d..e25d829587b9 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -452,6 +452,8 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels; pgt->mm_ops = mm_ops; pgt->mmu = NULL; + pgt->force_pte_cb = NULL; + return 0; } @@ -489,6 +491,9 @@ struct stage2_map_data { void *memcache; struct kvm_pgtable_mm_ops *mm_ops; + + /* Force mappings to page granularity */ + bool force_pte; }; u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) @@ -602,6 +607,15 @@ static bool stage2_pte_executable(kvm_pte_t pte) return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); } +static bool stage2_leaf_mapping_allowed(u64 addr, u64 end, u32 level, + struct stage2_map_data *data) +{ + if (data->force_pte && (level < (KVM_PGTABLE_MAX_LEVELS - 1))) + return false; + + return kvm_block_mapping_supported(addr, end, data->phys, level); +} + static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) @@ -611,7 +625,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, struct kvm_pgtable *pgt = data->mmu->pgt; struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; - if (!kvm_block_mapping_supported(addr, end, phys, level)) + if (!stage2_leaf_mapping_allowed(addr, end, level, data)) return -E2BIG; if (kvm_phys_is_valid(phys)) @@ -655,7 +669,7 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, if (data->anchor) return 0; - if (!kvm_block_mapping_supported(addr, end, data->phys, level)) + if (!stage2_leaf_mapping_allowed(addr, end, level, data)) return 0; data->childp = kvm_pte_follow(*ptep, data->mm_ops); @@ -785,6 +799,7 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, .mmu = pgt->mmu, .memcache = mc, .mm_ops = pgt->mm_ops, + .force_pte = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot), }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -816,6 +831,7 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, .memcache = mc, .mm_ops = pgt->mm_ops, .owner_id = owner_id, + .force_pte = true, }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -1057,9 +1073,11 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) return kvm_pgtable_walk(pgt, addr, size, &walker); } -int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch, - struct kvm_pgtable_mm_ops *mm_ops, - enum kvm_pgtable_stage2_flags flags) + +int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch, + struct kvm_pgtable_mm_ops *mm_ops, + enum kvm_pgtable_stage2_flags flags, + kvm_pgtable_force_pte_cb_t force_pte_cb) { size_t pgd_sz; u64 vtcr = arch->vtcr; @@ -1077,6 +1095,7 @@ int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch pgt->mm_ops = mm_ops; pgt->mmu = &arch->mmu; pgt->flags = flags; + pgt->force_pte_cb = force_pte_cb; /* Ensure zeroed PGD pages are visible to the hardware walker */ dsb(ishst); -- cgit v1.2.3 From 4505e9b624cefafa4b75d8a28e72f32076c33375 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:38 +0100 Subject: KVM: arm64: Allow populating software bits Introduce infrastructure allowing to manipulate software bits in stage-1 and stage-2 page-tables using additional entries in the kvm_pgtable_prot enum. This is heavily inspired by Marc's implementation of a similar feature in the NV patch series, but adapted to allow stage-1 changes as well: https://lore.kernel.org/kvmarm/20210510165920.1913477-56-maz@kernel.org/ Suggested-by: Marc Zyngier Signed-off-by: Quentin Perret Reviewed-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-12-qperret@google.com --- arch/arm64/include/asm/kvm_pgtable.h | 12 +++++++++++- arch/arm64/kvm/hyp/pgtable.c | 5 +++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index ca0c039547b5..bfea573703d7 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -121,6 +121,10 @@ enum kvm_pgtable_stage2_flags { * @KVM_PGTABLE_PROT_W: Write permission. * @KVM_PGTABLE_PROT_R: Read permission. * @KVM_PGTABLE_PROT_DEVICE: Device attributes. + * @KVM_PGTABLE_PROT_SW0: Software bit 0. + * @KVM_PGTABLE_PROT_SW1: Software bit 1. + * @KVM_PGTABLE_PROT_SW2: Software bit 2. + * @KVM_PGTABLE_PROT_SW3: Software bit 3. */ enum kvm_pgtable_prot { KVM_PGTABLE_PROT_X = BIT(0), @@ -128,6 +132,11 @@ enum kvm_pgtable_prot { KVM_PGTABLE_PROT_R = BIT(2), KVM_PGTABLE_PROT_DEVICE = BIT(3), + + KVM_PGTABLE_PROT_SW0 = BIT(55), + KVM_PGTABLE_PROT_SW1 = BIT(56), + KVM_PGTABLE_PROT_SW2 = BIT(57), + KVM_PGTABLE_PROT_SW3 = BIT(58), }; #define KVM_PGTABLE_PROT_RW (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W) @@ -420,7 +429,8 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr); * If there is a valid, leaf page-table entry used to translate @addr, then * relax the permissions in that entry according to the read, write and * execute permissions specified by @prot. No permissions are removed, and - * TLB invalidation is performed after updating the entry. + * TLB invalidation is performed after updating the entry. Software bits cannot + * be set or cleared using kvm_pgtable_stage2_relax_perms(). * * Return: 0 on success, negative error code on failure. */ diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index e25d829587b9..cff744136044 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -357,6 +357,7 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF; + attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; return 0; @@ -558,6 +559,7 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; + attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; return 0; @@ -1025,6 +1027,9 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, u32 level; kvm_pte_t set = 0, clr = 0; + if (prot & KVM_PTE_LEAF_ATTR_HI_SW) + return -EINVAL; + if (prot & KVM_PGTABLE_PROT_R) set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; -- cgit v1.2.3 From ec250a67ea8db6209918a389554cf3aec0395b1f Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:39 +0100 Subject: KVM: arm64: Add helpers to tag shared pages in SW bits We will soon start annotating shared pages in page-tables in nVHE protected mode. Define all the states in which a page can be (owned, shared and owned, shared and borrowed), and provide helpers allowing to convert this into SW bits annotations using the matching prot attributes. Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-13-qperret@google.com --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 9c227d87c36d..87b1690c439f 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -12,6 +12,32 @@ #include #include +/* + * SW bits 0-1 are reserved to track the memory ownership state of each page: + * 00: The page is owned exclusively by the page-table owner. + * 01: The page is owned by the page-table owner, but is shared + * with another entity. + * 10: The page is shared with, but not owned by the page-table owner. + * 11: Reserved for future use (lending). + */ +enum pkvm_page_state { + PKVM_PAGE_OWNED = 0ULL, + PKVM_PAGE_SHARED_OWNED = KVM_PGTABLE_PROT_SW0, + PKVM_PAGE_SHARED_BORROWED = KVM_PGTABLE_PROT_SW1, +}; + +#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) +static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, + enum pkvm_page_state state) +{ + return (prot & ~PKVM_PAGE_STATE_PROT_MASK) | state; +} + +static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) +{ + return prot & PKVM_PAGE_STATE_PROT_MASK; +} + struct host_kvm { struct kvm_arch arch; struct kvm_pgtable pgt; -- cgit v1.2.3 From 39257da0e04e5cdb1e4a3ca715dc3d949fe8b059 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:40 +0100 Subject: KVM: arm64: Expose host stage-2 manipulation helpers We will need to manipulate the host stage-2 page-table from outside mem_protect.c soon. Introduce two functions allowing this, and make them usable to users of mem_protect.h. Signed-off-by: Quentin Perret Reviewed-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-14-qperret@google.com --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 ++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 87b1690c439f..0849ee8fa260 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -49,6 +49,8 @@ extern struct host_kvm host_kvm; int __pkvm_prot_finalize(void); int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end); +int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); +int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id); int kvm_host_prepare_stage2(void *pgt_pool_base); void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 6fed6772c673..f95a5a4aa09c 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -273,6 +273,22 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) return 0; } +int host_stage2_idmap_locked(phys_addr_t addr, u64 size, + enum kvm_pgtable_prot prot) +{ + hyp_assert_lock_held(&host_kvm.lock); + + return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot); +} + +int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) +{ + hyp_assert_lock_held(&host_kvm.lock); + + return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt, + addr, size, &host_s2_pool, owner_id); +} + static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) { /* @@ -309,7 +325,7 @@ static int host_stage2_idmap(u64 addr) if (ret) goto unlock; - ret = host_stage2_try(__host_stage2_idmap, range.start, range.end, prot); + ret = host_stage2_idmap_locked(range.start, range.end - range.start, prot); unlock: hyp_spin_unlock(&host_kvm.lock); -- cgit v1.2.3 From 2d77e238badb022adb364332b7d6a1d627f77145 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:41 +0100 Subject: KVM: arm64: Expose pkvm_hyp_id Allow references to the hypervisor's owner id from outside mem_protect.c. Signed-off-by: Quentin Perret Reviewed-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-15-qperret@google.com --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 ++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 0849ee8fa260..23316a021880 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -46,6 +46,8 @@ struct host_kvm { }; extern struct host_kvm host_kvm; +extern const u8 pkvm_hyp_id; + int __pkvm_prot_finalize(void); int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index f95a5a4aa09c..ee255171945c 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -31,7 +31,7 @@ static struct hyp_pool host_s2_pool; u64 id_aa64mmfr0_el1_sys_val; u64 id_aa64mmfr1_el1_sys_val; -static const u8 pkvm_hyp_id = 1; +const u8 pkvm_hyp_id = 1; static void *host_s2_zalloc_pages_exact(size_t size) { -- cgit v1.2.3 From e009dce1292c37cf8ee7c33e0887ad3c642f980f Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:42 +0100 Subject: KVM: arm64: Introduce addr_is_memory() Introduce a helper usable in nVHE protected mode to check whether a physical address is in a RAM region or not. Signed-off-by: Quentin Perret Reviewed-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-16-qperret@google.com --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 23316a021880..49db0ec5a606 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -51,6 +51,7 @@ extern const u8 pkvm_hyp_id; int __pkvm_prot_finalize(void); int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end); +bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id); int kvm_host_prepare_stage2(void *pgt_pool_base); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index ee255171945c..cb023d31666e 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -197,6 +197,13 @@ static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) return false; } +bool addr_is_memory(phys_addr_t phys) +{ + struct kvm_mem_range range; + + return find_mem_range(phys, &range); +} + static bool range_is_memory(u64 start, u64 end) { struct kvm_mem_range r1, r2; -- cgit v1.2.3 From 9024b3d0069ab4b8ef70cf55f0ee09e61f3a0747 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:43 +0100 Subject: KVM: arm64: Enable retrieving protections attributes of PTEs Introduce helper functions in the KVM stage-2 and stage-1 page-table manipulation library allowing to retrieve the enum kvm_pgtable_prot of a PTE. This will be useful to implement custom walkers outside of pgtable.c. Signed-off-by: Quentin Perret Reviewed-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-17-qperret@google.com --- arch/arm64/include/asm/kvm_pgtable.h | 20 +++++++++++++++++++ arch/arm64/kvm/hyp/pgtable.c | 37 ++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index bfea573703d7..027783829584 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -507,4 +507,24 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, */ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, kvm_pte_t *ptep, u32 *level); + +/** + * kvm_pgtable_stage2_pte_prot() - Retrieve the protection attributes of a + * stage-2 Page-Table Entry. + * @pte: Page-table entry + * + * Return: protection attributes of the page-table entry in the enum + * kvm_pgtable_prot format. + */ +enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte); + +/** + * kvm_pgtable_hyp_pte_prot() - Retrieve the protection attributes of a stage-1 + * Page-Table Entry. + * @pte: Page-table entry + * + * Return: protection attributes of the page-table entry in the enum + * kvm_pgtable_prot format. + */ +enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte); #endif /* __ARM64_KVM_PGTABLE_H__ */ diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index cff744136044..f8ceebe4982e 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -363,6 +363,26 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) return 0; } +enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte) +{ + enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW; + u32 ap; + + if (!kvm_pte_valid(pte)) + return prot; + + if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN)) + prot |= KVM_PGTABLE_PROT_X; + + ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte); + if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO) + prot |= KVM_PGTABLE_PROT_R; + else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW) + prot |= KVM_PGTABLE_PROT_RW; + + return prot; +} + static bool hyp_pte_needs_update(kvm_pte_t old, kvm_pte_t new) { /* @@ -565,6 +585,23 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p return 0; } +enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte) +{ + enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW; + + if (!kvm_pte_valid(pte)) + return prot; + + if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R) + prot |= KVM_PGTABLE_PROT_R; + if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W) + prot |= KVM_PGTABLE_PROT_W; + if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN)) + prot |= KVM_PGTABLE_PROT_X; + + return prot; +} + static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new) { if (!kvm_pte_valid(old) || !kvm_pte_valid(new)) -- cgit v1.2.3 From 2c50166c62ba7f3c23c1bbdbb9324db462ddc97b Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:44 +0100 Subject: KVM: arm64: Mark host bss and rodata section as shared As the hypervisor maps the host's .bss and .rodata sections in its stage-1, make sure to tag them as shared in hyp and host page-tables. But since the hypervisor relies on the presence of these mappings, we cannot let the host in complete control of the memory regions -- it must not unshare or donate them to another entity for example. To prevent this, let's transfer the ownership of those ranges to the hypervisor itself, and share the pages back with the host. Signed-off-by: Quentin Perret Reviewed-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-18-qperret@google.com --- arch/arm64/kvm/hyp/nvhe/setup.c | 82 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 74 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 0b574d106519..57c27846320f 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -58,6 +58,7 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, { void *start, *end, *virt = hyp_phys_to_virt(phys); unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT; + enum kvm_pgtable_prot prot; int ret, i; /* Recreate the hyp page-table using the early page allocator */ @@ -83,10 +84,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; - ret = pkvm_create_mappings(__start_rodata, __end_rodata, PAGE_HYP_RO); - if (ret) - return ret; - ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO); if (ret) return ret; @@ -95,10 +92,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; - ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, PAGE_HYP_RO); - if (ret) - return ret; - ret = pkvm_create_mappings(virt, virt + size, PAGE_HYP); if (ret) return ret; @@ -117,6 +110,24 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, return ret; } + /* + * Map the host's .bss and .rodata sections RO in the hypervisor, but + * transfer the ownership from the host to the hypervisor itself to + * make sure it can't be donated or shared with another entity. + * + * The ownership transition requires matching changes in the host + * stage-2. This will be done later (see finalize_host_mappings()) once + * the hyp_vmemmap is addressable. + */ + prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED); + ret = pkvm_create_mappings(__start_rodata, __end_rodata, prot); + if (ret) + return ret; + + ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, prot); + if (ret) + return ret; + return 0; } @@ -148,6 +159,57 @@ static void hpool_put_page(void *addr) hyp_put_page(&hpool, addr); } +static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + enum kvm_pgtable_prot prot; + enum pkvm_page_state state; + kvm_pte_t pte = *ptep; + phys_addr_t phys; + + if (!kvm_pte_valid(pte)) + return 0; + + if (level != (KVM_PGTABLE_MAX_LEVELS - 1)) + return -EINVAL; + + phys = kvm_pte_to_phys(pte); + if (!addr_is_memory(phys)) + return 0; + + /* + * Adjust the host stage-2 mappings to match the ownership attributes + * configured in the hypervisor stage-1. + */ + state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte)); + switch (state) { + case PKVM_PAGE_OWNED: + return host_stage2_set_owner_locked(phys, PAGE_SIZE, pkvm_hyp_id); + case PKVM_PAGE_SHARED_OWNED: + prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED); + break; + case PKVM_PAGE_SHARED_BORROWED: + prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED); + break; + default: + return -EINVAL; + } + + return host_stage2_idmap_locked(phys, PAGE_SIZE, prot); +} + +static int finalize_host_mappings(void) +{ + struct kvm_pgtable_walker walker = { + .cb = finalize_host_mappings_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + return kvm_pgtable_walk(&pkvm_pgtable, 0, BIT(pkvm_pgtable.ia_bits), &walker); +} + void __noreturn __pkvm_init_finalise(void) { struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data); @@ -167,6 +229,10 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; + ret = finalize_host_mappings(); + if (ret) + goto out; + pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) { .zalloc_page = hyp_zalloc_hyp_page, .phys_to_virt = hyp_phys_to_virt, -- cgit v1.2.3 From ad0e0139a8e163245d8f44ab4f6ec3bc9b08034d Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:45 +0100 Subject: KVM: arm64: Remove __pkvm_mark_hyp Now that we mark memory owned by the hypervisor in the host stage-2 during __pkvm_init(), we no longer need to rely on the host to explicitly mark the hyp sections later on. Remove the __pkvm_mark_hyp() hypercall altogether. Signed-off-by: Quentin Perret Reviewed-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-19-qperret@google.com --- arch/arm64/include/asm/kvm_asm.h | 3 +- arch/arm64/kvm/arm.c | 46 --------------------------- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 - arch/arm64/kvm/hyp/nvhe/hyp-main.c | 9 ------ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 19 ----------- 5 files changed, 1 insertion(+), 77 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 9f0bf2109be7..432a9ea1f02e 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -63,8 +63,7 @@ #define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping 17 #define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18 #define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19 -#define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp 20 -#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 21 +#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 20 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e9a2b8f27792..2f378482471b 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1954,57 +1954,11 @@ static void _kvm_host_prot_finalize(void *discard) WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)); } -static inline int pkvm_mark_hyp(phys_addr_t start, phys_addr_t end) -{ - return kvm_call_hyp_nvhe(__pkvm_mark_hyp, start, end); -} - -#define pkvm_mark_hyp_section(__section) \ - pkvm_mark_hyp(__pa_symbol(__section##_start), \ - __pa_symbol(__section##_end)) - static int finalize_hyp_mode(void) { - int cpu, ret; - if (!is_protected_kvm_enabled()) return 0; - ret = pkvm_mark_hyp_section(__hyp_idmap_text); - if (ret) - return ret; - - ret = pkvm_mark_hyp_section(__hyp_text); - if (ret) - return ret; - - ret = pkvm_mark_hyp_section(__hyp_rodata); - if (ret) - return ret; - - ret = pkvm_mark_hyp_section(__hyp_bss); - if (ret) - return ret; - - ret = pkvm_mark_hyp(hyp_mem_base, hyp_mem_base + hyp_mem_size); - if (ret) - return ret; - - for_each_possible_cpu(cpu) { - phys_addr_t start = virt_to_phys((void *)kvm_arm_hyp_percpu_base[cpu]); - phys_addr_t end = start + (PAGE_SIZE << nvhe_percpu_order()); - - ret = pkvm_mark_hyp(start, end); - if (ret) - return ret; - - start = virt_to_phys((void *)per_cpu(kvm_arm_hyp_stack_page, cpu)); - end = start + PAGE_SIZE; - ret = pkvm_mark_hyp(start, end); - if (ret) - return ret; - } - /* * Flip the static key upfront as that may no longer be possible * once the host stage 2 is installed. diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 49db0ec5a606..0118527b07b0 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -49,7 +49,6 @@ extern struct host_kvm host_kvm; extern const u8 pkvm_hyp_id; int __pkvm_prot_finalize(void); -int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 1632f001f4ed..7900d5b66ba3 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -163,14 +163,6 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) { cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } - -static void handle___pkvm_mark_hyp(struct kvm_cpu_context *host_ctxt) -{ - DECLARE_REG(phys_addr_t, start, host_ctxt, 1); - DECLARE_REG(phys_addr_t, end, host_ctxt, 2); - - cpu_reg(host_ctxt, 1) = __pkvm_mark_hyp(start, end); -} typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x @@ -196,7 +188,6 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_create_mappings), HANDLE_FUNC(__pkvm_create_private_mapping), HANDLE_FUNC(__pkvm_prot_finalize), - HANDLE_FUNC(__pkvm_mark_hyp), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index cb023d31666e..2991dc6996b9 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -339,25 +339,6 @@ unlock: return ret; } -int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end) -{ - int ret; - - /* - * host_stage2_unmap_dev_all() currently relies on MMIO mappings being - * non-persistent, so don't allow changing page ownership in MMIO range. - */ - if (!range_is_memory(start, end)) - return -EINVAL; - - hyp_spin_lock(&host_kvm.lock); - ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt, - start, end - start, &host_s2_pool, pkvm_hyp_id); - hyp_spin_unlock(&host_kvm.lock); - - return ret != -EAGAIN ? ret : 0; -} - void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) { struct kvm_vcpu_fault_info fault; -- cgit v1.2.3 From f9370010e92638f66473baf342e19de940403362 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:46 +0100 Subject: KVM: arm64: Refactor protected nVHE stage-1 locking Refactor the hypervisor stage-1 locking in nVHE protected mode to expose a new pkvm_create_mappings_locked() function. This will be used in later patches to allow walking and changing the hypervisor stage-1 without releasing the lock. Signed-off-by: Quentin Perret Reviewed-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-20-qperret@google.com --- arch/arm64/kvm/hyp/include/nvhe/mm.h | 1 + arch/arm64/kvm/hyp/nvhe/mm.c | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index 8ec3a5a7744b..c76d7136ed9b 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -23,6 +23,7 @@ int hyp_map_vectors(void); int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back); int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot); int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot); +int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot); int __pkvm_create_mappings(unsigned long start, unsigned long size, unsigned long phys, enum kvm_pgtable_prot prot); unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size, diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index a8efdf0f9003..6fbe8e8030f6 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -67,13 +67,15 @@ out: return addr; } -int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) +int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot) { unsigned long start = (unsigned long)from; unsigned long end = (unsigned long)to; unsigned long virt_addr; phys_addr_t phys; + hyp_assert_lock_held(&pkvm_pgd_lock); + start = start & PAGE_MASK; end = PAGE_ALIGN(end); @@ -81,7 +83,8 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) int err; phys = hyp_virt_to_phys((void *)virt_addr); - err = __pkvm_create_mappings(virt_addr, PAGE_SIZE, phys, prot); + err = kvm_pgtable_hyp_map(&pkvm_pgtable, virt_addr, PAGE_SIZE, + phys, prot); if (err) return err; } @@ -89,6 +92,17 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) return 0; } +int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) +{ + int ret; + + hyp_spin_lock(&pkvm_pgd_lock); + ret = pkvm_create_mappings_locked(from, to, prot); + hyp_spin_unlock(&pkvm_pgd_lock); + + return ret; +} + int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back) { unsigned long start, end; -- cgit v1.2.3 From 66c57edd3bc79e3527daaae8123f72ecd1e3fa25 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:47 +0100 Subject: KVM: arm64: Restrict EL2 stage-1 changes in protected mode The host kernel is currently able to change EL2 stage-1 mappings without restrictions thanks to the __pkvm_create_mappings() hypercall. But in a world where the host is no longer part of the TCB, this clearly poses a problem. To fix this, introduce a new hypercall to allow the host to share a physical memory page with the hypervisor, and remove the __pkvm_create_mappings() variant. The new hypercall implements ownership and permission checks before allowing the sharing operation, and it annotates the shared page in the hypervisor stage-1 and host stage-2 page-tables. Signed-off-by: Quentin Perret Reviewed-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-21-qperret@google.com --- arch/arm64/include/asm/kvm_asm.h | 2 +- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 11 ++-- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 88 +++++++++++++++++++++++++++ arch/arm64/kvm/mmu.c | 28 +++++++-- 5 files changed, 118 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 432a9ea1f02e..aed2aa61766a 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -59,7 +59,7 @@ #define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14 #define __KVM_HOST_SMCCC_FUNC___pkvm_init 15 -#define __KVM_HOST_SMCCC_FUNC___pkvm_create_mappings 16 +#define __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp 16 #define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping 17 #define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18 #define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19 diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 0118527b07b0..03e604f842e2 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -49,6 +49,7 @@ extern struct host_kvm host_kvm; extern const u8 pkvm_hyp_id; int __pkvm_prot_finalize(void); +int __pkvm_host_share_hyp(u64 pfn); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 7900d5b66ba3..2da6aa8da868 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -140,14 +140,11 @@ static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = pkvm_cpu_set_vector(slot); } -static void handle___pkvm_create_mappings(struct kvm_cpu_context *host_ctxt) +static void handle___pkvm_host_share_hyp(struct kvm_cpu_context *host_ctxt) { - DECLARE_REG(unsigned long, start, host_ctxt, 1); - DECLARE_REG(unsigned long, size, host_ctxt, 2); - DECLARE_REG(unsigned long, phys, host_ctxt, 3); - DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4); + DECLARE_REG(u64, pfn, host_ctxt, 1); - cpu_reg(host_ctxt, 1) = __pkvm_create_mappings(start, size, phys, prot); + cpu_reg(host_ctxt, 1) = __pkvm_host_share_hyp(pfn); } static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt) @@ -185,7 +182,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__vgic_v3_restore_aprs), HANDLE_FUNC(__pkvm_init), HANDLE_FUNC(__pkvm_cpu_set_vector), - HANDLE_FUNC(__pkvm_create_mappings), + HANDLE_FUNC(__pkvm_host_share_hyp), HANDLE_FUNC(__pkvm_create_private_mapping), HANDLE_FUNC(__pkvm_prot_finalize), }; diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 2991dc6996b9..8165390d3ec9 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -339,6 +339,94 @@ unlock: return ret; } +static inline bool check_prot(enum kvm_pgtable_prot prot, + enum kvm_pgtable_prot required, + enum kvm_pgtable_prot denied) +{ + return (prot & (required | denied)) == required; +} + +int __pkvm_host_share_hyp(u64 pfn) +{ + phys_addr_t addr = hyp_pfn_to_phys(pfn); + enum kvm_pgtable_prot prot, cur; + void *virt = __hyp_va(addr); + enum pkvm_page_state state; + kvm_pte_t pte; + int ret; + + if (!addr_is_memory(addr)) + return -EINVAL; + + hyp_spin_lock(&host_kvm.lock); + hyp_spin_lock(&pkvm_pgd_lock); + + ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, NULL); + if (ret) + goto unlock; + if (!pte) + goto map_shared; + + /* + * Check attributes in the host stage-2 PTE. We need the page to be: + * - mapped RWX as we're sharing memory; + * - not borrowed, as that implies absence of ownership. + * Otherwise, we can't let it got through + */ + cur = kvm_pgtable_stage2_pte_prot(pte); + prot = pkvm_mkstate(0, PKVM_PAGE_SHARED_BORROWED); + if (!check_prot(cur, PKVM_HOST_MEM_PROT, prot)) { + ret = -EPERM; + goto unlock; + } + + state = pkvm_getstate(cur); + if (state == PKVM_PAGE_OWNED) + goto map_shared; + + /* + * Tolerate double-sharing the same page, but this requires + * cross-checking the hypervisor stage-1. + */ + if (state != PKVM_PAGE_SHARED_OWNED) { + ret = -EPERM; + goto unlock; + } + + ret = kvm_pgtable_get_leaf(&pkvm_pgtable, (u64)virt, &pte, NULL); + if (ret) + goto unlock; + + /* + * If the page has been shared with the hypervisor, it must be + * already mapped as SHARED_BORROWED in its stage-1. + */ + cur = kvm_pgtable_hyp_pte_prot(pte); + prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); + if (!check_prot(cur, prot, ~prot)) + ret = EPERM; + goto unlock; + +map_shared: + /* + * If the page is not yet shared, adjust mappings in both page-tables + * while both locks are held. + */ + prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); + ret = pkvm_create_mappings_locked(virt, virt + PAGE_SIZE, prot); + BUG_ON(ret); + + prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED); + ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot); + BUG_ON(ret); + +unlock: + hyp_spin_unlock(&pkvm_pgd_lock); + hyp_spin_unlock(&host_kvm.lock); + + return ret; +} + void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) { struct kvm_vcpu_fault_info fault; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0625bf2353c2..cbab146cda6a 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -259,10 +259,8 @@ static int __create_hyp_mappings(unsigned long start, unsigned long size, { int err; - if (!kvm_host_owns_hyp_mappings()) { - return kvm_call_hyp_nvhe(__pkvm_create_mappings, - start, size, phys, prot); - } + if (WARN_ON(!kvm_host_owns_hyp_mappings())) + return -EINVAL; mutex_lock(&kvm_hyp_pgd_mutex); err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); @@ -282,6 +280,21 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr) } } +static int pkvm_share_hyp(phys_addr_t start, phys_addr_t end) +{ + phys_addr_t addr; + int ret; + + for (addr = ALIGN_DOWN(start, PAGE_SIZE); addr < end; addr += PAGE_SIZE) { + ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, + __phys_to_pfn(addr)); + if (ret) + return ret; + } + + return 0; +} + /** * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode * @from: The virtual kernel start address of the range @@ -302,6 +315,13 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) if (is_kernel_in_hyp_mode()) return 0; + if (!kvm_host_owns_hyp_mappings()) { + if (WARN_ON(prot != PAGE_HYP)) + return -EPERM; + return pkvm_share_hyp(kvm_kaddr_to_phys(from), + kvm_kaddr_to_phys(to)); + } + start = start & PAGE_MASK; end = PAGE_ALIGN(end); -- cgit v1.2.3 From 64a80fb766f9a91e26930bfc56d8e7c12425df12 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 9 Aug 2021 16:24:48 +0100 Subject: KVM: arm64: Make __pkvm_create_mappings static The __pkvm_create_mappings() function is no longer used outside of nvhe/mm.c, make it static. Signed-off-by: Quentin Perret Reviewed-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210809152448.1810400-22-qperret@google.com --- arch/arm64/kvm/hyp/include/nvhe/mm.h | 2 -- arch/arm64/kvm/hyp/nvhe/mm.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index c76d7136ed9b..c9a8f535212e 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -24,8 +24,6 @@ int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back); int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot); int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot); int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot); -int __pkvm_create_mappings(unsigned long start, unsigned long size, - unsigned long phys, enum kvm_pgtable_prot prot); unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 6fbe8e8030f6..2fabeceb889a 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -23,8 +23,8 @@ u64 __io_map_base; struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS]; unsigned int hyp_memblock_nr; -int __pkvm_create_mappings(unsigned long start, unsigned long size, - unsigned long phys, enum kvm_pgtable_prot prot) +static int __pkvm_create_mappings(unsigned long start, unsigned long size, + unsigned long phys, enum kvm_pgtable_prot prot) { int err; -- cgit v1.2.3 From 6fadc1241c33fe0228c94bc6a1aa6c1da8872e8b Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 11 Aug 2021 08:57:06 +0530 Subject: KVM: arm64: perf: Replace '0xf' instances with ID_AA64DFR0_PMUVER_IMP_DEF ID_AA64DFR0_PMUVER_IMP_DEF which indicate implementation defined PMU, never actually gets used although there are '0xf' instances scattered all around. Just do the macro replacement to improve readability. Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: Peter Zijlstra Cc: Marc Zyngier Cc: linux-perf-users@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: kvmarm@lists.cs.columbia.edu Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Signed-off-by: Marc Zyngier --- arch/arm64/kvm/perf.c | 2 +- arch/arm64/kvm/pmu-emul.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c index 151c31fb9860..f9bb3b14130e 100644 --- a/arch/arm64/kvm/perf.c +++ b/arch/arm64/kvm/perf.c @@ -50,7 +50,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { int kvm_perf_init(void) { - if (kvm_pmu_probe_pmuver() != 0xf && !is_protected_kvm_enabled()) + if (kvm_pmu_probe_pmuver() != ID_AA64DFR0_PMUVER_IMP_DEF && !is_protected_kvm_enabled()) static_branch_enable(&kvm_arm_pmu_available); return perf_register_guest_info_callbacks(&kvm_guest_cbs); diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index f33825c995cb..60f89bdbeebb 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -745,7 +745,7 @@ int kvm_pmu_probe_pmuver(void) struct perf_event_attr attr = { }; struct perf_event *event; struct arm_pmu *pmu; - int pmuver = 0xf; + int pmuver = ID_AA64DFR0_PMUVER_IMP_DEF; /* * Create a dummy event that only counts user cycles. As we'll never @@ -770,7 +770,7 @@ int kvm_pmu_probe_pmuver(void) if (IS_ERR(event)) { pr_err_once("kvm: pmu event creation failed %ld\n", PTR_ERR(event)); - return 0xf; + return ID_AA64DFR0_PMUVER_IMP_DEF; } if (event->pmu) { @@ -923,7 +923,7 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) if (!vcpu->kvm->arch.pmuver) vcpu->kvm->arch.pmuver = kvm_pmu_probe_pmuver(); - if (vcpu->kvm->arch.pmuver == 0xf) + if (vcpu->kvm->arch.pmuver == ID_AA64DFR0_PMUVER_IMP_DEF) return -ENODEV; switch (attr->attr) { -- cgit v1.2.3 From b31578f627177bda5c16894e3170a7a6a1236136 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 10 Aug 2021 09:59:42 +0530 Subject: arm64/mm: Define ID_AA64MMFR0_TGRAN_2_SHIFT Streamline the Stage-2 TGRAN value extraction from ID_AA64MMFR0 register by adding a page size agnostic ID_AA64MMFR0_TGRAN_2_SHIFT. This is similar to the existing Stage-1 TGRAN shift i.e ID_AA64MMFR0_TGRAN_SHIFT. Cc: Catalin Marinas Cc: Will Deacon Cc: Marc Zyngier Cc: linux-arm-kernel@lists.infradead.org Cc: kvmarm@lists.cs.columbia.edu Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Acked-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1628569782-30213-1-git-send-email-anshuman.khandual@arm.com --- arch/arm64/include/asm/sysreg.h | 3 +++ arch/arm64/kvm/reset.c | 17 ++--------------- 2 files changed, 5 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 7b9c3acba684..943d31d92b5b 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1028,14 +1028,17 @@ #if defined(CONFIG_ARM64_4K_PAGES) #define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN4_SHIFT +#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN4_2_SHIFT #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN4_SUPPORTED #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX 0x7 #elif defined(CONFIG_ARM64_16K_PAGES) #define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN16_SHIFT +#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN16_2_SHIFT #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN16_SUPPORTED #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX 0xF #elif defined(CONFIG_ARM64_64K_PAGES) #define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN64_SHIFT +#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN64_2_SHIFT #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN64_SUPPORTED #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX 0x7 #endif diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index cba7872d69a8..20588220fe66 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -311,7 +311,7 @@ u32 get_kvm_ipa_limit(void) int kvm_set_ipa_limit(void) { - unsigned int parange, tgran_2; + unsigned int parange; u64 mmfr0; mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); @@ -322,20 +322,7 @@ int kvm_set_ipa_limit(void) * Check with ARMv8.5-GTG that our PAGE_SIZE is supported at * Stage-2. If not, things will stop very quickly. */ - switch (PAGE_SIZE) { - default: - case SZ_4K: - tgran_2 = ID_AA64MMFR0_TGRAN4_2_SHIFT; - break; - case SZ_16K: - tgran_2 = ID_AA64MMFR0_TGRAN16_2_SHIFT; - break; - case SZ_64K: - tgran_2 = ID_AA64MMFR0_TGRAN64_2_SHIFT; - break; - } - - switch (cpuid_feature_extract_unsigned_field(mmfr0, tgran_2)) { + switch (cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_TGRAN_2_SHIFT)) { case ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE: kvm_err("PAGE_SIZE not supported at Stage-2, giving up\n"); return -EINVAL; -- cgit v1.2.3 From 5e5df9571c319fb107d7a523cc96fcc99961ee70 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 11 Aug 2021 16:41:15 +0530 Subject: KVM: arm64: Restrict IPA size to maximum 48 bits on 4K and 16K page size Even though ID_AA64MMFR0.PARANGE reports 52 bit PA size support, it cannot be enabled as guest IPA size on 4K or 16K page size configurations. Hence kvm_ipa_limit must be restricted to 48 bits. This change achieves required IPA capping. Before the commit c9b69a0cf0b4 ("KVM: arm64: Don't constrain maximum IPA size based on host configuration"), the problem here would have been just latent via PHYS_MASK_SHIFT (which earlier in turn capped kvm_ipa_limit), which remains capped at 48 bits on 4K and 16K configs. Cc: Marc Zyngier Cc: James Morse Cc: Alexandru Elisei Cc: Suzuki K Poulose Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: kvmarm@lists.cs.columbia.edu Cc: linux-kernel@vger.kernel.org Fixes: c9b69a0cf0b4 ("KVM: arm64: Don't constrain maximum IPA size based on host configuration") Signed-off-by: Anshuman Khandual Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1628680275-16578-1-git-send-email-anshuman.khandual@arm.com --- arch/arm64/kvm/reset.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 20588220fe66..18ffc6ad67b8 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -317,6 +317,14 @@ int kvm_set_ipa_limit(void) mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); parange = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_PARANGE_SHIFT); + /* + * IPA size beyond 48 bits could not be supported + * on either 4K or 16K page size. Hence let's cap + * it to 48 bits, in case it's reported as larger + * on the system. + */ + if (PAGE_SIZE != SZ_64K) + parange = min(parange, (unsigned int)ID_AA64MMFR0_PARANGE_48); /* * Check with ARMv8.5-GTG that our PAGE_SIZE is supported at -- cgit v1.2.3 From 12593568d7319c34c72038ea799ab1bd0f0eb01c Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 11 Aug 2021 18:36:25 +0100 Subject: KVM: arm64: Return -EPERM from __pkvm_host_share_hyp() Fix the error code returned by __pkvm_host_share_hyp() when the host attempts to share with EL2 a page that has already been shared with another entity. Reported-by: Will Deacon Signed-off-by: Quentin Perret Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210811173630.2536721-1-qperret@google.com --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 8165390d3ec9..6ec695311498 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -404,7 +404,7 @@ int __pkvm_host_share_hyp(u64 pfn) cur = kvm_pgtable_hyp_pte_prot(pte); prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); if (!check_prot(cur, prot, ~prot)) - ret = EPERM; + ret = -EPERM; goto unlock; map_shared: -- cgit v1.2.3 From 34e9f860071f717965f1816171a11eaf2d378ee0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 10 Aug 2021 01:43:05 +0800 Subject: KVM: X86: Remove unneeded KVM_DEBUGREG_RELOAD Commit ae561edeb421 ("KVM: x86: DR0-DR3 are not clear on reset") added code to ensure eff_db are updated when they're modified through non-standard paths. But there is no reason to also update hardware DRs unless hardware breakpoints are active or DR exiting is disabled, and in those cases updating hardware is handled by KVM_DEBUGREG_WONT_EXIT and KVM_DEBUGREG_BP_ENABLED. KVM_DEBUGREG_RELOAD just causes unnecesarry load of hardware DRs and is better to be removed. Suggested-by: Sean Christopherson Signed-off-by: Lai Jiangshan Message-Id: <20210809174307.145263-1-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/x86.c | 3 --- 2 files changed, 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 07d2652c6d61..5e35334980a6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -523,7 +523,6 @@ struct kvm_pmu_ops; enum { KVM_DEBUGREG_BP_ENABLED = 1, KVM_DEBUGREG_WONT_EXIT = 2, - KVM_DEBUGREG_RELOAD = 4, }; struct kvm_mtrr_range { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3cedc7cc132a..eca5f005a7fc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1181,7 +1181,6 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { for (i = 0; i < KVM_NR_DB_REGS; i++) vcpu->arch.eff_db[i] = vcpu->arch.db[i]; - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; } } @@ -9604,7 +9603,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); set_debugreg(vcpu->arch.dr6, 6); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } else if (unlikely(hw_breakpoint_active())) { set_debugreg(0, 7); } @@ -9634,7 +9632,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) static_call(kvm_x86_sync_dirty_debug_regs)(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr7(vcpu); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } /* -- cgit v1.2.3 From 375e28ffc0cf4fc48862c03994ec4a93254cf1c6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Aug 2021 06:07:06 -0400 Subject: KVM: X86: Set host DR6 only on VMX and for KVM_DEBUGREG_WONT_EXIT Commit c77fb5fe6f03 ("KVM: x86: Allow the guest to run with dirty debug registers") allows the guest accessing to DRs without exiting when KVM_DEBUGREG_WONT_EXIT and we need to ensure that they are synchronized on entry to the guest---including DR6 that was not synced before the commit. But the commit sets the hardware DR6 not only when KVM_DEBUGREG_WONT_EXIT, but also when KVM_DEBUGREG_BP_ENABLED. The second case is unnecessary and just leads to a more case which leaks stale DR6 to the host which has to be resolved by unconditionally reseting DR6 in kvm_arch_vcpu_put(). Even if KVM_DEBUGREG_WONT_EXIT, however, setting the host DR6 only matters on VMX because SVM always uses the DR6 value from the VMCB. So move this line to vmx.c and make it conditional on KVM_DEBUGREG_WONT_EXIT. Reported-by: Lai Jiangshan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 ++++ arch/x86/kvm/x86.c | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ae8e62df16dd..7c1c8ef1a286 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6625,6 +6625,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->host_state.cr4 = cr4; } + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + set_debugreg(vcpu->arch.dr6, 6); + /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eca5f005a7fc..15ac8645e532 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9602,7 +9602,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); - set_debugreg(vcpu->arch.dr6, 6); } else if (unlikely(hw_breakpoint_active())) { set_debugreg(0, 7); } -- cgit v1.2.3 From 1ccb6f983a063e794daeb03f90b3517f87dfae8f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Aug 2021 06:11:35 -0400 Subject: KVM: VMX: Reset DR6 only when KVM_DEBUGREG_WONT_EXIT The commit efdab992813fb ("KVM: x86: fix escape of guest dr6 to the host") fixed a bug by resetting DR6 unconditionally when the vcpu being scheduled out. But writing to debug registers is slow, and it can be visible in perf results sometimes, even if neither the host nor the guest activate breakpoints. Since KVM_DEBUGREG_WONT_EXIT on Intel processors is the only case where DR6 gets the guest value, and it never happens at all on SVM, the register can be cleared in vmx.c right after reading it. Reported-by: Lai Jiangshan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 6 ++++++ arch/x86/kvm/x86.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7c1c8ef1a286..123ecca91c27 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5110,6 +5110,12 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); + + /* + * exc_debug expects dr6 to be cleared after it runs, avoid that it sees + * a stale dr6 from the guest. + */ + set_debugreg(DR6_RESERVED, 6); } static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 15ac8645e532..70d42b50199a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4310,12 +4310,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static_call(kvm_x86_vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); - /* - * If userspace has set any breakpoints or watchpoints, dr6 is restored - * on every vmexit, but if not, we might have a stale dr6 from the - * guest. do_debug expects dr6 to be cleared after it runs, do the same. - */ - set_debugreg(0, 6); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 389ab25216c9d09e0d335e764eeeb84c2089614f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Aug 2021 10:19:50 -0700 Subject: KVM: nVMX: Pull KVM L0's desired controls directly from vmcs01 When preparing controls for vmcs02, grab KVM's desired controls from vmcs01's shadow state instead of recalculating the controls from scratch, or in the secondary execution controls, instead of using the dedicated cache. Calculating secondary exec controls is eye-poppingly expensive due to the guest CPUID checks, hence the dedicated cache, but the other calculations aren't exactly free either. Explicitly clear several bits (x2APIC, DESC exiting, and load EFER on exit) as appropriate as they may be set in vmcs01, whereas the previous implementation relied on dynamic bits being cleared in the calculator. Intentionally propagate VM_{ENTRY,EXIT}_LOAD_IA32_PERF_GLOBAL_CTRL from vmcs01 to vmcs02. Whether or not PERF_GLOBAL_CTRL is loaded depends on whether or not perf itself is active, so unless perf stops between the exit from L1 and entry to L2, vmcs01 will hold the desired value. This is purely an optimization as atomic_switch_perf_msrs() will set/clear the control as needed at VM-Enter, i.e. it avoids two extra VMWRITEs in the case where perf is active (versus starting with the bits clear in vmcs02, which was the previous behavior). Cc: Zeng Guang Signed-off-by: Sean Christopherson Message-Id: <20210810171952.2758100-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 25 ++++++++++++++++--------- arch/x86/kvm/vmx/vmx.h | 6 +++++- 2 files changed, 21 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0d0dd6580cfd..264a9f4c9179 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2170,7 +2170,8 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, } } -static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, + struct vmcs12 *vmcs12) { u32 exec_control; u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); @@ -2181,7 +2182,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) /* * PIN CONTROLS */ - exec_control = vmx_pin_based_exec_ctrl(vmx); + exec_control = __pin_controls_get(vmcs01); exec_control |= (vmcs12->pin_based_vm_exec_control & ~PIN_BASED_VMX_PREEMPTION_TIMER); @@ -2197,7 +2198,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) /* * EXEC CONTROLS */ - exec_control = vmx_exec_control(vmx); /* L0's desires */ + exec_control = __exec_controls_get(vmcs01); /* L0's desires */ exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; exec_control &= ~CPU_BASED_TPR_SHADOW; @@ -2234,10 +2235,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * SECONDARY EXEC CONTROLS */ if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmx->secondary_exec_control; + exec_control = __secondary_exec_controls_get(vmcs01); /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_XSAVES | @@ -2245,7 +2247,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_TSC_SCALING); + SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_DESC); + if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -2285,8 +2289,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * on the related bits (if supported by the CPU) in the hope that * we can avoid VMWrites during vmx_set_efer(). */ - exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & - ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; + exec_control = __vm_entry_controls_get(vmcs01); + exec_control |= vmcs12->vm_entry_controls; + exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); if (cpu_has_load_ia32_efer()) { if (guest_efer & EFER_LMA) exec_control |= VM_ENTRY_IA32E_MODE; @@ -2302,9 +2307,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits may be modified by vmx_set_efer() in prepare_vmcs02(). */ - exec_control = vmx_vmexit_ctrl(); + exec_control = __vm_exit_controls_get(vmcs01); if (cpu_has_load_ia32_efer() && guest_efer != host_efer) exec_control |= VM_EXIT_LOAD_IA32_EFER; + else + exec_control &= ~VM_EXIT_LOAD_IA32_EFER; vm_exit_controls_set(vmx, exec_control); /* @@ -3347,7 +3354,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); - prepare_vmcs02_early(vmx, vmcs12); + prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); if (from_vmentry) { if (unlikely(!nested_get_vmcs12_pages(vcpu))) { diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 4f151175d45a..414b440de9ac 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -418,9 +418,13 @@ static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ vmx->loaded_vmcs->controls_shadow.lname = val; \ } \ } \ +static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \ +{ \ + return vmcs->controls_shadow.lname; \ +} \ static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \ { \ - return vmx->loaded_vmcs->controls_shadow.lname; \ + return __##lname##_controls_get(vmx->loaded_vmcs); \ } \ static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \ { \ -- cgit v1.2.3 From b6247686b7571003eca2305b2096f59e1e1ce976 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Aug 2021 10:19:51 -0700 Subject: KVM: VMX: Drop caching of KVM's desired sec exec controls for vmcs01 Remove the secondary execution controls cache now that it's effectively dead code; it is only read immediately after it is written. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210810171952.2758100-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 20 ++++++++------------ arch/x86/kvm/vmx/vmx.h | 3 +-- 2 files changed, 9 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 123ecca91c27..f2622dd335ce 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4212,7 +4212,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) -static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) +u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { struct kvm_vcpu *vcpu = &vmx->vcpu; @@ -4298,7 +4298,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (!vcpu->kvm->arch.bus_lock_detection_enabled) exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; - vmx->secondary_exec_control = exec_control; + return exec_control; } #define VMX_XSS_EXIT_BITMAP 0 @@ -4322,10 +4322,8 @@ static void init_vmcs(struct vcpu_vmx *vmx) exec_controls_set(vmx, vmx_exec_control(vmx)); - if (cpu_has_secondary_exec_ctrls()) { - vmx_compute_secondary_exec_control(vmx); - secondary_exec_controls_set(vmx, vmx->secondary_exec_control); - } + if (cpu_has_secondary_exec_ctrls()) + secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); if (kvm_vcpu_apicv_active(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); @@ -6992,7 +6990,7 @@ exit: return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; } -static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) +static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) { /* * These bits in the secondary execution controls field @@ -7006,7 +7004,6 @@ static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_DESC; - u32 new_ctl = vmx->secondary_exec_control; u32 cur_ctl = secondary_exec_controls_get(vmx); secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); @@ -7151,10 +7148,9 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_setup_uret_msrs(vmx); - if (cpu_has_secondary_exec_ctrls()) { - vmx_compute_secondary_exec_control(vmx); - vmcs_set_secondary_exec_control(vmx); - } + if (cpu_has_secondary_exec_ctrls()) + vmcs_set_secondary_exec_control(vmx, + vmx_secondary_exec_control(vmx)); if (nested_vmx_allowed(vcpu)) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 414b440de9ac..2bd07867e9da 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -263,8 +263,6 @@ struct vcpu_vmx { u64 spec_ctrl; u32 msr_ia32_umwait_control; - u32 secondary_exec_control; - /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested @@ -477,6 +475,7 @@ static inline u32 vmx_vmexit_ctrl(void) } u32 vmx_exec_control(struct vcpu_vmx *vmx); +u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx); u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx); static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) -- cgit v1.2.3 From 2fba4fc155280727b4997c6ee86f24c260dd9155 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Aug 2021 10:19:52 -0700 Subject: KVM: VMX: Hide VMCS control calculators in vmx.c Now that nested VMX pulls KVM's desired VMCS controls from vmcs01 instead of re-calculating on the fly, bury the helpers that do the calcluations in vmx.c. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210810171952.2758100-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 30 +++++++++++++++++++++++++++--- arch/x86/kvm/vmx/vmx.h | 26 -------------------------- 2 files changed, 27 insertions(+), 29 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f2622dd335ce..cd913100b300 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4095,7 +4095,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); } -u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) +static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; @@ -4111,6 +4111,30 @@ u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) return pin_based_exec_ctrl; } +static u32 vmx_vmentry_ctrl(void) +{ + u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; + + if (vmx_pt_mode_is_system()) + vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | + VM_ENTRY_LOAD_IA32_RTIT_CTL); + /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ + return vmentry_ctrl & + ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); +} + +static u32 vmx_vmexit_ctrl(void) +{ + u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; + + if (vmx_pt_mode_is_system()) + vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | + VM_EXIT_CLEAR_IA32_RTIT_CTL); + /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ + return vmexit_ctrl & + ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); +} + static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4130,7 +4154,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) vmx_update_msr_bitmap_x2apic(vcpu); } -u32 vmx_exec_control(struct vcpu_vmx *vmx) +static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; @@ -4212,7 +4236,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) -u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) +static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { struct kvm_vcpu *vcpu = &vmx->vcpu; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 2bd07867e9da..4858c5fd95f2 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -452,32 +452,6 @@ static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu) vcpu->arch.regs_dirty = 0; } -static inline u32 vmx_vmentry_ctrl(void) -{ - u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; - if (vmx_pt_mode_is_system()) - vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | - VM_ENTRY_LOAD_IA32_RTIT_CTL); - /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmentry_ctrl & - ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); -} - -static inline u32 vmx_vmexit_ctrl(void) -{ - u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; - if (vmx_pt_mode_is_system()) - vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | - VM_EXIT_CLEAR_IA32_RTIT_CTL); - /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmexit_ctrl & - ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); -} - -u32 vmx_exec_control(struct vcpu_vmx *vmx); -u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx); -u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx); - static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) { return container_of(kvm, struct kvm_vmx, kvm); -- cgit v1.2.3 From ad0577c375299a2cc426913c141086c0e9033c78 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 9 Aug 2021 10:39:54 -0700 Subject: KVM: x86: Kill off __ex() and __kvm_handle_fault_on_reboot() Remove the __kvm_handle_fault_on_reboot() and __ex() macros now that all VMX and SVM instructions use asm goto to handle the fault (or in the case of VMREAD, completely custom logic). Drop kvm_spurious_fault()'s asmlinkage annotation as __kvm_handle_fault_on_reboot() was the only flow that invoked it from assembly code. Cc: Uros Bizjak Cc: Like Xu Signed-off-by: Sean Christopherson Message-Id: <20210809173955.1710866-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 25 +------------------------ arch/x86/kvm/svm/sev.c | 2 -- arch/x86/kvm/svm/svm.c | 2 -- arch/x86/kvm/vmx/vmx_ops.h | 2 -- arch/x86/kvm/x86.c | 9 ++++++++- 5 files changed, 9 insertions(+), 31 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5e35334980a6..8c8aa8ab5f0b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1811,30 +1811,7 @@ enum { #define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) #define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) -asmlinkage void kvm_spurious_fault(void); - -/* - * Hardware virtualization extension instructions may fault if a - * reboot turns off virtualization while processes are running. - * Usually after catching the fault we just panic; during reboot - * instead the instruction is ignored. - */ -#define __kvm_handle_fault_on_reboot(insn) \ - "666: \n\t" \ - insn "\n\t" \ - "jmp 668f \n\t" \ - "667: \n\t" \ - "1: \n\t" \ - ".pushsection .discard.instr_begin \n\t" \ - ".long 1b - . \n\t" \ - ".popsection \n\t" \ - "call kvm_spurious_fault \n\t" \ - "1: \n\t" \ - ".pushsection .discard.instr_end \n\t" \ - ".long 1b - . \n\t" \ - ".popsection \n\t" \ - "668: \n\t" \ - _ASM_EXTABLE(666b, 667b) +void kvm_spurious_fault(void); #define KVM_ARCH_WANT_MMU_NOTIFIER diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 81601c31f549..75e0b21ad07c 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -28,8 +28,6 @@ #include "cpuid.h" #include "trace.h" -#define __ex(x) __kvm_handle_fault_on_reboot(x) - #ifndef CONFIG_KVM_AMD_SEV /* * When this config is not defined, SEV feature is not supported and APIs in diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9d72b1df426e..2b6632d4c76f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -46,8 +46,6 @@ #include "kvm_onhyperv.h" #include "svm_onhyperv.h" -#define __ex(x) __kvm_handle_fault_on_reboot(x) - MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 164b64f65a8f..c0d74b994b56 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -10,8 +10,6 @@ #include "evmcs.h" #include "vmcs.h" -#define __ex(x) __kvm_handle_fault_on_reboot(x) - asmlinkage void vmread_error(unsigned long field, bool fault); __attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field, bool fault); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 70d42b50199a..58a72c7d3330 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -486,7 +486,14 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -asmlinkage __visible noinstr void kvm_spurious_fault(void) +/* + * Handle a fault on a hardware virtualization (VMX or SVM) instruction. + * + * Hardware virtualization extension instructions may fault if a reboot turns + * off virtualization while processes are running. Usually after catching the + * fault we just panic; during reboot instead the instruction is ignored. + */ +noinstr void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ BUG_ON(!kvm_rebooting); -- cgit v1.2.3 From 65297341d8e15b04cc9e206597a3d7c407c346f6 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 9 Aug 2021 10:39:55 -0700 Subject: KVM: x86: Move declaration of kvm_spurious_fault() to x86.h Move the declaration of kvm_spurious_fault() to KVM's "private" x86.h, it should never be called by anything other than low level KVM code. Cc: Paolo Bonzini Cc: Sean Christopherson Signed-off-by: Uros Bizjak [sean: rebased to a series without __ex()/__kvm_handle_fault_on_reboot()] Signed-off-by: Sean Christopherson Message-Id: <20210809173955.1710866-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 -- arch/x86/kvm/svm/svm_ops.h | 2 +- arch/x86/kvm/vmx/vmx_ops.h | 2 +- arch/x86/kvm/x86.h | 2 ++ 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8c8aa8ab5f0b..a612d213be5c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1811,8 +1811,6 @@ enum { #define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) #define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) -void kvm_spurious_fault(void); - #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h index 8170f2a5a16f..22e2b019de37 100644 --- a/arch/x86/kvm/svm/svm_ops.h +++ b/arch/x86/kvm/svm/svm_ops.h @@ -4,7 +4,7 @@ #include -#include +#include "x86.h" #define svm_asm(insn, clobber...) \ do { \ diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index c0d74b994b56..9e9ef47e988c 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -4,11 +4,11 @@ #include -#include #include #include "evmcs.h" #include "vmcs.h" +#include "x86.h" asmlinkage void vmread_error(unsigned long field, bool fault); __attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field, diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 44ae10312740..7d66d63dc55a 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,6 +8,8 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" +void kvm_spurious_fault(void); + static __always_inline void kvm_guest_enter_irqoff(void) { /* -- cgit v1.2.3 From c1a527a1de46ad6f0f9d5907b29fc98e50267f8e Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 9 Aug 2021 17:34:08 +0800 Subject: KVM: x86: Clean up redundant ROL16(val, n) macro definition The ROL16(val, n) macro is repeatedly defined in several vmcs-related files, and it has never been used outside the KVM context. Let's move it to vmcs.h without any intended functional changes. Signed-off-by: Like Xu Message-Id: <20210809093410.59304-4-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/evmcs.c | 1 - arch/x86/kvm/vmx/evmcs.h | 4 ---- arch/x86/kvm/vmx/vmcs.h | 2 ++ arch/x86/kvm/vmx/vmcs12.c | 1 - arch/x86/kvm/vmx/vmcs12.h | 4 ---- 5 files changed, 2 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 896b2a50b4aa..0dab1b7b529f 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -14,7 +14,6 @@ DEFINE_STATIC_KEY_FALSE(enable_evmcs); #if IS_ENABLED(CONFIG_HYPERV) -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) #define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ {EVMCS1_OFFSET(name), clean_field} diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 2ec9b46f0d0c..152ab0aa82cf 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -73,8 +73,6 @@ struct evmcs_field { extern const struct evmcs_field vmcs_field_to_evmcs_1[]; extern const unsigned int nr_evmcs_1_fields; -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) - static __always_inline int get_evmcs_offset(unsigned long field, u16 *clean_field) { @@ -95,8 +93,6 @@ static __always_inline int get_evmcs_offset(unsigned long field, return evmcs_field->offset; } -#undef ROL16 - static inline void evmcs_write64(unsigned long field, u64 value) { u16 clean_field; diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 4b9957e2bf5b..6e5de2e2b0da 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -11,6 +11,8 @@ #include "capabilities.h" +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) + struct vmcs_hdr { u32 revision_id:31; u32 shadow_vmcs:1; diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index d9f5d7c56ae3..cab6ba7a5005 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -2,7 +2,6 @@ #include "vmcs12.h" -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) #define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) #define FIELD64(number, name) \ diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 5e0e1b39f495..2a45f026ee11 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -364,8 +364,6 @@ static inline void vmx_check_vmcs12_offsets(void) extern const unsigned short vmcs_field_to_offset_table[]; extern const unsigned int nr_vmcs12_fields; -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) - static inline short vmcs_field_to_offset(unsigned long field) { unsigned short offset; @@ -385,8 +383,6 @@ static inline short vmcs_field_to_offset(unsigned long field) return offset; } -#undef ROL16 - static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field, u16 offset) { -- cgit v1.2.3 From f7782bb8d818d8f47c26b22079db10599922787a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Aug 2021 07:45:26 -0700 Subject: KVM: nVMX: Unconditionally clear nested.pi_pending on nested VM-Enter Clear nested.pi_pending on nested VM-Enter even if L2 will run without posted interrupts enabled. If nested.pi_pending is left set from a previous L2, vmx_complete_nested_posted_interrupt() will pick up the stale flag and exit to userspace with an "internal emulation error" due the new L2 not having a valid nested.pi_desc. Arguably, vmx_complete_nested_posted_interrupt() should first check for posted interrupts being enabled, but it's also completely reasonable that KVM wouldn't screw up a fundamental flag. Not to mention that the mere existence of nested.pi_pending is a long-standing bug as KVM shouldn't move the posted interrupt out of the IRR until it's actually processed, e.g. KVM effectively drops an interrupt when it performs a nested VM-Exit with a "pending" posted interrupt. Fixing the mess is a future problem. Prior to vmx_complete_nested_posted_interrupt() interpreting a null PI descriptor as an error, this was a benign bug as the null PI descriptor effectively served as a check on PI not being enabled. Even then, the new flow did not become problematic until KVM started checking the result of kvm_check_nested_events(). Fixes: 705699a13994 ("KVM: nVMX: Enable nested posted interrupt processing") Fixes: 966eefb89657 ("KVM: nVMX: Disable vmcs02 posted interrupts if vmcs12 PID isn't mappable") Fixes: 47d3530f86c0 ("KVM: x86: Exit to userspace when kvm_check_nested_events fails") Cc: stable@vger.kernel.org Cc: Jim Mattson Signed-off-by: Sean Christopherson Message-Id: <20210810144526.2662272-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 264a9f4c9179..bc6327950657 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2187,12 +2187,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 ~PIN_BASED_VMX_PREEMPTION_TIMER); /* Posted interrupts setting is only taken from vmcs12. */ - if (nested_cpu_has_posted_intr(vmcs12)) { + vmx->nested.pi_pending = false; + if (nested_cpu_has_posted_intr(vmcs12)) vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; - vmx->nested.pi_pending = false; - } else { + else exec_control &= ~PIN_BASED_POSTED_INTR; - } pin_controls_set(vmx, exec_control); /* -- cgit v1.2.3 From 504c6295b998effa682089747a96d7bb5933d4db Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 12 Aug 2021 10:39:50 +0530 Subject: arm64/mm: Add remaining ID_AA64MMFR0_PARANGE_ macros Currently there are macros only for 48 and 52 bits parange value extracted from the ID_AA64MMFR0.PARANGE field. This change completes the enumeration and updates the helper id_aa64mmfr0_parange_to_phys_shift(). While here it also defines ARM64_MIN_PARANGE_BITS as the absolute minimum shift value PA range which could be supported on a given platform. Cc: Marc Zyngier Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: kvmarm@lists.cs.columbia.edu Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1628744994-16623-2-git-send-email-anshuman.khandual@arm.com --- arch/arm64/include/asm/cpufeature.h | 14 +++++++------- arch/arm64/include/asm/sysreg.h | 7 +++++++ 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 9bb9d11750d7..8633bdb21f33 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -781,13 +781,13 @@ extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) { switch (parange) { - case 0: return 32; - case 1: return 36; - case 2: return 40; - case 3: return 42; - case 4: return 44; - case 5: return 48; - case 6: return 52; + case ID_AA64MMFR0_PARANGE_32: return 32; + case ID_AA64MMFR0_PARANGE_36: return 36; + case ID_AA64MMFR0_PARANGE_40: return 40; + case ID_AA64MMFR0_PARANGE_42: return 42; + case ID_AA64MMFR0_PARANGE_44: return 44; + case ID_AA64MMFR0_PARANGE_48: return 48; + case ID_AA64MMFR0_PARANGE_52: return 52; /* * A future PE could use a value unknown to the kernel. * However, by the "D10.1.4 Principles of the ID scheme diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 943d31d92b5b..1972e4b9be5c 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -853,9 +853,16 @@ #define ID_AA64MMFR0_TGRAN64_SUPPORTED 0x0 #define ID_AA64MMFR0_TGRAN16_NI 0x0 #define ID_AA64MMFR0_TGRAN16_SUPPORTED 0x1 +#define ID_AA64MMFR0_PARANGE_32 0x0 +#define ID_AA64MMFR0_PARANGE_36 0x1 +#define ID_AA64MMFR0_PARANGE_40 0x2 +#define ID_AA64MMFR0_PARANGE_42 0x3 +#define ID_AA64MMFR0_PARANGE_44 0x4 #define ID_AA64MMFR0_PARANGE_48 0x5 #define ID_AA64MMFR0_PARANGE_52 0x6 +#define ARM64_MIN_PARANGE_BITS 32 + #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_DEFAULT 0x0 #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE 0x1 #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_MIN 0x2 -- cgit v1.2.3 From 9788c14060f3c179c376b2a87af1a430d4d84973 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 12 Aug 2021 10:39:51 +0530 Subject: KVM: arm64: Use ARM64_MIN_PARANGE_BITS as the minimum supported IPA Drop hard coded value for the minimum supported IPA range bits (i.e 32). Instead use ARM64_MIN_PARANGE_BITS which improves the code readability. Cc: Marc Zyngier Cc: James Morse Cc: Alexandru Elisei Cc: Suzuki K Poulose Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: kvmarm@lists.cs.columbia.edu Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1628744994-16623-3-git-send-email-anshuman.khandual@arm.com --- arch/arm64/kvm/reset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 18ffc6ad67b8..08fd487d5f95 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -364,7 +364,7 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); if (phys_shift) { if (phys_shift > kvm_ipa_limit || - phys_shift < 32) + phys_shift < ARM64_MIN_PARANGE_BITS) return -EINVAL; } else { phys_shift = KVM_PHYS_SHIFT; -- cgit v1.2.3 From bf249d9e362f1011a839d57e771b4b1a7eed9656 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 12 Aug 2021 10:39:52 +0530 Subject: KVM: arm64: Drop init_common_resources() Could do without this additional indirection via init_common_resources() by just calling kvm_set_ipa_limit() directly instead. Cc: Marc Zyngier Cc: James Morse Cc: Alexandru Elisei Cc: Suzuki K Poulose Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: kvmarm@lists.cs.columbia.edu Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1628744994-16623-4-git-send-email-anshuman.khandual@arm.com --- arch/arm64/kvm/arm.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e9a2b8f27792..19560e457c11 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1696,11 +1696,6 @@ static bool init_psci_relay(void) return true; } -static int init_common_resources(void) -{ - return kvm_set_ipa_limit(); -} - static int init_subsystems(void) { int err = 0; @@ -2102,7 +2097,7 @@ int kvm_arch_init(void *opaque) } } - err = init_common_resources(); + err = kvm_set_ipa_limit(); if (err) return err; -- cgit v1.2.3 From 6b7982fefc1fdcaa31b712f5fbc2e993cc99ad23 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 12 Aug 2021 10:39:53 +0530 Subject: KVM: arm64: Drop check_kvm_target_cpu() based percpu probe kvm_target_cpu() never returns a negative error code, so check_kvm_target() would never have 'ret' filled with a negative error code. Hence the percpu probe via check_kvm_target_cpu() does not make sense as its never going to find an unsupported CPU, forcing kvm_arch_init() to exit early. Hence lets just drop this percpu probe (and also check_kvm_target_cpu()) altogether. While here, this also changes kvm_target_cpu() return type to a u32, making it explicit that an error code will not be returned from this function. Cc: Marc Zyngier Cc: James Morse Cc: Alexandru Elisei Cc: Suzuki K Poulose Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: kvmarm@lists.cs.columbia.edu Cc: linux-kernel@vger.kernel.org Acked-by: Will Deacon Signed-off-by: Anshuman Khandual Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1628744994-16623-5-git-send-email-anshuman.khandual@arm.com --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/arm.c | 16 +--------------- arch/arm64/kvm/guest.c | 4 ++-- 3 files changed, 4 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 41911585ae0c..26bbd84a02de 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -66,7 +66,7 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); extern unsigned int kvm_sve_max_vl; int kvm_arm_init_sve(void); -int __attribute_const__ kvm_target_cpu(void); +u32 __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 19560e457c11..ee9b1166f330 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1035,7 +1035,7 @@ static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init) { unsigned int i, ret; - int phys_target = kvm_target_cpu(); + u32 phys_target = kvm_target_cpu(); if (init->target != phys_target) return -EINVAL; @@ -2010,11 +2010,6 @@ static int finalize_hyp_mode(void) return 0; } -static void check_kvm_target_cpu(void *ret) -{ - *(int *)ret = kvm_target_cpu(); -} - struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) { struct kvm_vcpu *vcpu; @@ -2074,7 +2069,6 @@ void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons) int kvm_arch_init(void *opaque) { int err; - int ret, cpu; bool in_hyp_mode; if (!is_hyp_mode_available()) { @@ -2089,14 +2083,6 @@ int kvm_arch_init(void *opaque) kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \ "Only trusted guests should be used on this system.\n"); - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1); - if (ret < 0) { - kvm_err("Error, CPU %d not supported!\n", cpu); - return -ENODEV; - } - } - err = kvm_set_ipa_limit(); if (err) return err; diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 1dfb83578277..8ce850fa7b49 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -842,7 +842,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, return 0; } -int __attribute_const__ kvm_target_cpu(void) +u32 __attribute_const__ kvm_target_cpu(void) { unsigned long implementor = read_cpuid_implementor(); unsigned long part_number = read_cpuid_part_number(); @@ -874,7 +874,7 @@ int __attribute_const__ kvm_target_cpu(void) int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init) { - int target = kvm_target_cpu(); + u32 target = kvm_target_cpu(); if (target < 0) return -ENODEV; -- cgit v1.2.3 From 9329752bc8659e3934e2b13434b2fddb0df0bb13 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 12 Aug 2021 10:39:54 +0530 Subject: KVM: arm64: Drop unused REQUIRES_VIRT This seems like a residue from the past. REQUIRES_VIRT is no more available . Hence it can just be dropped along with the related code section. Cc: Marc Zyngier Cc: James Morse Cc: Alexandru Elisei Cc: Suzuki K Poulose Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: kvmarm@lists.cs.columbia.edu Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1628744994-16623-6-git-send-email-anshuman.khandual@arm.com --- arch/arm64/kvm/arm.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ee9b1166f330..d779a29c5607 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -42,10 +42,6 @@ #include #include -#ifdef REQUIRES_VIRT -__asm__(".arch_extension virt"); -#endif - static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); -- cgit v1.2.3 From ccac96977243d7916053550f62e6489760ad0adc Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 13 Aug 2021 14:03:36 +0100 Subject: KVM: arm64: Make hyp_panic() more robust when protected mode is enabled When protected mode is enabled, the host is unable to access most parts of the EL2 hypervisor image, including 'hyp_physvirt_offset' and the contents of the hypervisor's '.rodata.str' section. Unfortunately, nvhe_hyp_panic_handler() tries to read from both of these locations when handling a BUG() triggered at EL2; the former for converting the ELR to a physical address and the latter for displaying the name of the source file where the BUG() occurred. Hack the EL2 panic asm to pass both physical and virtual ELR values to the host and utilise the newly introduced CONFIG_NVHE_EL2_DEBUG so that we disable stage-2 protection for the host before returning to the EL1 panic handler. If the debug option is not enabled, display the address instead of the source file:line information. Cc: Andrew Scull Cc: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210813130336.8139-1-will@kernel.org --- arch/arm64/kvm/handle_exit.c | 23 ++++++++++++++--------- arch/arm64/kvm/hyp/nvhe/host.S | 21 +++++++++++++++++---- 2 files changed, 31 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 6f48336b1d86..04ebab299aa4 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -292,11 +292,12 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index) kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu)); } -void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr, +void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, + u64 elr_virt, u64 elr_phys, u64 par, uintptr_t vcpu, u64 far, u64 hpfar) { - u64 elr_in_kimg = __phys_to_kimg(__hyp_pa(elr)); - u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr; + u64 elr_in_kimg = __phys_to_kimg(elr_phys); + u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr_virt; u64 mode = spsr & PSR_MODE_MASK; /* @@ -309,20 +310,24 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr, kvm_err("Invalid host exception to nVHE hyp!\n"); } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && (esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) { - struct bug_entry *bug = find_bug(elr_in_kimg); const char *file = NULL; unsigned int line = 0; /* All hyp bugs, including warnings, are treated as fatal. */ - if (bug) - bug_get_file_line(bug, &file, &line); + if (!is_protected_kvm_enabled() || + IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { + struct bug_entry *bug = find_bug(elr_in_kimg); + + if (bug) + bug_get_file_line(bug, &file, &line); + } if (file) kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); else - kvm_err("nVHE hyp BUG at: %016llx!\n", elr + hyp_offset); + kvm_err("nVHE hyp BUG at: %016llx!\n", elr_virt + hyp_offset); } else { - kvm_err("nVHE hyp panic at: %016llx!\n", elr + hyp_offset); + kvm_err("nVHE hyp panic at: %016llx!\n", elr_virt + hyp_offset); } /* @@ -334,5 +339,5 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr, kvm_err("Hyp Offset: 0x%llx\n", hyp_offset); panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n", - spsr, elr, esr, far, hpfar, par, vcpu); + spsr, elr_virt, esr, far, hpfar, par, vcpu); } diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 2b23400e0fb3..4b652ffb591d 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -85,12 +86,24 @@ SYM_FUNC_START(__hyp_do_panic) mov x29, x0 +#ifdef CONFIG_NVHE_EL2_DEBUG + /* Ensure host stage-2 is disabled */ + mrs x0, hcr_el2 + bic x0, x0, #HCR_VM + msr hcr_el2, x0 + isb + tlbi vmalls12e1 + dsb nsh +#endif + /* Load the panic arguments into x0-7 */ mrs x0, esr_el2 - get_vcpu_ptr x4, x5 - mrs x5, far_el2 - mrs x6, hpfar_el2 - mov x7, xzr // Unused argument + mov x4, x3 + mov x3, x2 + hyp_pa x3, x6 + get_vcpu_ptr x5, x6 + mrs x6, far_el2 + mrs x7, hpfar_el2 /* Enter the host, conditionally restoring the host context. */ cbz x29, __host_enter_without_restoring -- cgit v1.2.3 From 6654f9dfcb88fea3b9affc180dc3c04333d0f306 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 18 Aug 2021 20:21:30 +0000 Subject: KVM: arm64: Fix read-side race on updates to vcpu reset state KVM correctly serializes writes to a vCPU's reset state, however since we do not take the KVM lock on the read side it is entirely possible to read state from two different reset requests. Cure the race for now by taking the KVM lock when reading the reset_state structure. Fixes: 358b28f09f0a ("arm/arm64: KVM: Allow a VCPU to fully reset itself") Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210818202133.1106786-2-oupton@google.com --- arch/arm64/kvm/reset.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index cba7872d69a8..d862441b03b1 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -210,10 +210,16 @@ static bool vcpu_allowed_register_width(struct kvm_vcpu *vcpu) */ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) { + struct vcpu_reset_state reset_state; int ret; bool loaded; u32 pstate; + mutex_lock(&vcpu->kvm->lock); + reset_state = vcpu->arch.reset_state; + WRITE_ONCE(vcpu->arch.reset_state.reset, false); + mutex_unlock(&vcpu->kvm->lock); + /* Reset PMU outside of the non-preemptible section */ kvm_pmu_vcpu_reset(vcpu); @@ -276,8 +282,8 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) * Additional reset state handling that PSCI may have imposed on us. * Must be done after all the sys_reg reset. */ - if (vcpu->arch.reset_state.reset) { - unsigned long target_pc = vcpu->arch.reset_state.pc; + if (reset_state.reset) { + unsigned long target_pc = reset_state.pc; /* Gracefully handle Thumb2 entry point */ if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) { @@ -286,13 +292,11 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) } /* Propagate caller endianness */ - if (vcpu->arch.reset_state.be) + if (reset_state.be) kvm_vcpu_set_be(vcpu); *vcpu_pc(vcpu) = target_pc; - vcpu_set_reg(vcpu, 0, vcpu->arch.reset_state.r0); - - vcpu->arch.reset_state.reset = false; + vcpu_set_reg(vcpu, 0, reset_state.r0); } /* Reset timer */ -- cgit v1.2.3 From 6826c6849b46aaa91300201213701eb861af4ba0 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 18 Aug 2021 20:21:31 +0000 Subject: KVM: arm64: Handle PSCI resets before userspace touches vCPU state The CPU_ON PSCI call takes a payload that KVM uses to configure a destination vCPU to run. This payload is non-architectural state and not exposed through any existing UAPI. Effectively, we have a race between CPU_ON and userspace saving/restoring a guest: if the target vCPU isn't ran again before the VMM saves its state, the requested PC and context ID are lost. When restored, the target vCPU will be runnable and start executing at its old PC. We can avoid this race by making sure the reset payload is serviced before userspace can access a vCPU's state. Fixes: 358b28f09f0a ("arm/arm64: KVM: Allow a VCPU to fully reset itself") Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210818202133.1106786-3-oupton@google.com --- arch/arm64/kvm/arm.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e9a2b8f27792..8ce9996bcbfd 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1215,6 +1215,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(®, argp, sizeof(reg))) break; + /* + * We could owe a reset due to PSCI. Handle the pending reset + * here to ensure userspace register accesses are ordered after + * the reset. + */ + if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) + kvm_reset_vcpu(vcpu); + if (ioctl == KVM_SET_ONE_REG) r = kvm_arm_set_reg(vcpu, ®); else -- cgit v1.2.3 From e10ecb4d6c0761ca545b3946df1707a41f9f845e Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 18 Aug 2021 20:21:32 +0000 Subject: KVM: arm64: Enforce reserved bits for PSCI target affinities According to the PSCI specification, ARM DEN 0022D, 5.1.4 "CPU_ON", the CPU_ON function takes a target_cpu argument that is bit-compatible with the affinity fields in MPIDR_EL1. All other bits in the argument are RES0. Note that the same constraints apply to the target_affinity argument for the AFFINITY_INFO call. Enforce the spec by returning INVALID_PARAMS if a guest incorrectly sets a RES0 bit. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210818202133.1106786-4-oupton@google.com --- arch/arm64/kvm/psci.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index db4056ecccfd..74c47d420253 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -59,6 +59,12 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) kvm_vcpu_kick(vcpu); } +static inline bool kvm_psci_valid_affinity(struct kvm_vcpu *vcpu, + unsigned long affinity) +{ + return !(affinity & ~MPIDR_HWID_BITMASK); +} + static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) { struct vcpu_reset_state *reset_state; @@ -66,9 +72,9 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) struct kvm_vcpu *vcpu = NULL; unsigned long cpu_id; - cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK; - if (vcpu_mode_is_32bit(source_vcpu)) - cpu_id &= ~((u32) 0); + cpu_id = smccc_get_arg1(source_vcpu); + if (!kvm_psci_valid_affinity(source_vcpu, cpu_id)) + return PSCI_RET_INVALID_PARAMS; vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id); @@ -126,6 +132,9 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) target_affinity = smccc_get_arg1(vcpu); lowest_affinity_level = smccc_get_arg2(vcpu); + if (!kvm_psci_valid_affinity(vcpu, target_affinity)) + return PSCI_RET_INVALID_PARAMS; + /* Determine target affinity mask */ target_affinity_mask = psci_affinity_mask(lowest_affinity_level); if (!target_affinity_mask) -- cgit v1.2.3 From fe5161d2c39b8c2801f0e786631460c6e8a1cae4 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 2 Aug 2021 19:28:07 +0000 Subject: KVM: arm64: Record number of signal exits as a vCPU stat Most other architectures that implement KVM record a statistic indicating the number of times a vCPU has exited due to a pending signal. Add support for that stat to arm64. Reviewed-by: Jing Zhang Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210802192809.1851010-2-oupton@google.com --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/guest.c | 1 + 3 files changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 41911585ae0c..70e129f2b574 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -576,6 +576,7 @@ struct kvm_vcpu_stat { u64 wfi_exit_stat; u64 mmio_exit_user; u64 mmio_exit_kernel; + u64 signal_exits; u64 exits; }; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e9a2b8f27792..60d0a546d7fd 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -783,6 +783,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (signal_pending(current)) { ret = -EINTR; run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; } /* diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 1dfb83578277..853d1e8d2e73 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -50,6 +50,7 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, wfi_exit_stat), STATS_DESC_COUNTER(VCPU, mmio_exit_user), STATS_DESC_COUNTER(VCPU, mmio_exit_kernel), + STATS_DESC_COUNTER(VCPU, signal_exits), STATS_DESC_COUNTER(VCPU, exits) }; static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == -- cgit v1.2.3 From 6caa5812e2d126a0aa8a17816c1ba6f0a0c2b309 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 2 Aug 2021 19:28:09 +0000 Subject: KVM: arm64: Use generic KVM xfer to guest work function Clean up handling of checks for pending work by switching to the generic infrastructure to do so. We pick up handling for TIF_NOTIFY_RESUME from this switch, meaning that task work will be correctly handled. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210802192809.1851010-4-oupton@google.com --- arch/arm64/kvm/Kconfig | 1 + arch/arm64/kvm/arm.c | 72 ++++++++++++++++++++++++++++++-------------------- 2 files changed, 45 insertions(+), 28 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index a4eba0908bfa..8bc1fac5fa26 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -26,6 +26,7 @@ menuconfig KVM select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO select KVM_GENERIC_DIRTYLOG_READ_PROTECT + select KVM_XFER_TO_GUEST_WORK select SRCU select KVM_VFIO select HAVE_KVM_EVENTFD diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 60d0a546d7fd..8245efc6e88f 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -714,6 +715,45 @@ static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu) static_branch_unlikely(&arm64_mismatched_32bit_el0); } +/** + * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest + * @vcpu: The VCPU pointer + * @ret: Pointer to write optional return code + * + * Returns: true if the VCPU needs to return to a preemptible + interruptible + * and skip guest entry. + * + * This function disambiguates between two different types of exits: exits to a + * preemptible + interruptible kernel context and exits to userspace. For an + * exit to userspace, this function will write the return code to ret and return + * true. For an exit to preemptible + interruptible kernel context (i.e. check + * for pending work and re-enter), return true without writing to ret. + */ +static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret) +{ + struct kvm_run *run = vcpu->run; + + /* + * If we're using a userspace irqchip, then check if we need + * to tell a userspace irqchip about timer or PMU level + * changes and if so, exit to userspace (the actual level + * state gets updated in kvm_timer_update_run and + * kvm_pmu_update_run below). + */ + if (static_branch_unlikely(&userspace_irqchip_in_use)) { + if (kvm_timer_should_notify_user(vcpu) || + kvm_pmu_should_notify_user(vcpu)) { + *ret = -EINTR; + run->exit_reason = KVM_EXIT_INTR; + return true; + } + } + + return kvm_request_pending(vcpu) || + need_new_vmid_gen(&vcpu->arch.hw_mmu->vmid) || + xfer_to_guest_mode_work_pending(); +} + /** * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code * @vcpu: The VCPU pointer @@ -757,7 +797,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* * Check conditions before entering the guest */ - cond_resched(); + ret = xfer_to_guest_mode_handle_work(vcpu); + if (!ret) + ret = 1; update_vmid(&vcpu->arch.hw_mmu->vmid); @@ -776,31 +818,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_vgic_flush_hwstate(vcpu); - /* - * Exit if we have a signal pending so that we can deliver the - * signal to user space. - */ - if (signal_pending(current)) { - ret = -EINTR; - run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - } - - /* - * If we're using a userspace irqchip, then check if we need - * to tell a userspace irqchip about timer or PMU level - * changes and if so, exit to userspace (the actual level - * state gets updated in kvm_timer_update_run and - * kvm_pmu_update_run below). - */ - if (static_branch_unlikely(&userspace_irqchip_in_use)) { - if (kvm_timer_should_notify_user(vcpu) || - kvm_pmu_should_notify_user(vcpu)) { - ret = -EINTR; - run->exit_reason = KVM_EXIT_INTR; - } - } - /* * Ensure we set mode to IN_GUEST_MODE after we disable * interrupts and before the final VCPU requests check. @@ -809,8 +826,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); - if (ret <= 0 || need_new_vmid_gen(&vcpu->arch.hw_mmu->vmid) || - kvm_request_pending(vcpu)) { + if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) { vcpu->mode = OUTSIDE_GUEST_MODE; isb(); /* Ensure work in x_flush_hwstate is committed */ kvm_pmu_sync_hwstate(vcpu); -- cgit v1.2.3 From b9a51949cebcd57bfb9385d9da62ace52564898c Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Wed, 18 Aug 2021 14:32:05 -0700 Subject: KVM: arm64: vgic: Drop WARN from vgic_get_irq vgic_get_irq(intid) is used all over the vgic code in order to get a reference to a struct irq. It warns whenever intid is not a valid number (like when it's a reserved IRQ number). The issue is that this warning can be triggered from userspace (e.g., KVM_IRQ_LINE for intid 1020). Drop the WARN call from vgic_get_irq. Signed-off-by: Ricardo Koller Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210818213205.598471-1-ricarkol@google.com --- arch/arm64/kvm/vgic/vgic.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 111bff47e471..81cec508d413 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -106,7 +106,6 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, if (intid >= VGIC_MIN_LPI) return vgic_get_lpi(kvm, intid); - WARN(1, "Looking up struct vgic_irq for reserved INTID"); return NULL; } -- cgit v1.2.3 From 3134cc8beb69d0db9de651081707c4651c011621 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 19 Aug 2021 19:03:05 +0100 Subject: KVM: arm64: vgic: Resample HW pending state on deactivation When a mapped level interrupt (a timer, for example) is deactivated by the guest, the corresponding host interrupt is equally deactivated. However, the fate of the pending state still needs to be dealt with in SW. This is specially true when the interrupt was in the active+pending state in the virtual distributor at the point where the guest was entered. On exit, the pending state is potentially stale (the guest may have put the interrupt in a non-pending state). If we don't do anything, the interrupt will be spuriously injected in the guest. Although this shouldn't have any ill effect (spurious interrupts are always possible), we can improve the emulation by detecting the deactivation-while-pending case and resample the interrupt. While we're at it, move the logic into a common helper that can be shared between the two GIC implementations. Fixes: e40cc57bac79 ("KVM: arm/arm64: vgic: Support level-triggered mapped interrupts") Reported-by: Raghavendra Rao Ananta Tested-by: Raghavendra Rao Ananta Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210819180305.1670525-1-maz@kernel.org --- arch/arm64/kvm/vgic/vgic-v2.c | 36 +++++------------------------------- arch/arm64/kvm/vgic/vgic-v3.c | 36 +++++------------------------------- arch/arm64/kvm/vgic/vgic.c | 38 ++++++++++++++++++++++++++++++++++++++ arch/arm64/kvm/vgic/vgic.h | 2 ++ 4 files changed, 50 insertions(+), 62 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 2c580204f1dc..95a18cec14a3 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -60,6 +60,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) u32 val = cpuif->vgic_lr[lr]; u32 cpuid, intid = val & GICH_LR_VIRTUALID; struct vgic_irq *irq; + bool deactivated; /* Extract the source vCPU id from the LR */ cpuid = val & GICH_LR_PHYSID_CPUID; @@ -75,7 +76,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) raw_spin_lock(&irq->irq_lock); - /* Always preserve the active bit */ + /* Always preserve the active bit, note deactivation */ + deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT); irq->active = !!(val & GICH_LR_ACTIVE_BIT); if (irq->active && vgic_irq_is_sgi(intid)) @@ -96,36 +98,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE)) irq->pending_latch = false; - /* - * Level-triggered mapped IRQs are special because we only - * observe rising edges as input to the VGIC. - * - * If the guest never acked the interrupt we have to sample - * the physical line and set the line level, because the - * device state could have changed or we simply need to - * process the still pending interrupt later. - * - * If this causes us to lower the level, we have to also clear - * the physical active state, since we will otherwise never be - * told when the interrupt becomes asserted again. - * - * Another case is when the interrupt requires a helping hand - * on deactivation (no HW deactivation, for example). - */ - if (vgic_irq_is_mapped_level(irq)) { - bool resample = false; - - if (val & GICH_LR_PENDING_BIT) { - irq->line_level = vgic_get_phys_line_level(irq); - resample = !irq->line_level; - } else if (vgic_irq_needs_resampling(irq) && - !(irq->active || irq->pending_latch)) { - resample = true; - } - - if (resample) - vgic_irq_set_phys_active(irq, false); - } + /* Handle resampling for mapped interrupts if required */ + vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT); raw_spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 66004f61cd83..21a6207fb2ee 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -46,6 +46,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) u32 intid, cpuid; struct vgic_irq *irq; bool is_v2_sgi = false; + bool deactivated; cpuid = val & GICH_LR_PHYSID_CPUID; cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; @@ -68,7 +69,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) raw_spin_lock(&irq->irq_lock); - /* Always preserve the active bit */ + /* Always preserve the active bit, note deactivation */ + deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT); irq->active = !!(val & ICH_LR_ACTIVE_BIT); if (irq->active && is_v2_sgi) @@ -89,36 +91,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) irq->pending_latch = false; - /* - * Level-triggered mapped IRQs are special because we only - * observe rising edges as input to the VGIC. - * - * If the guest never acked the interrupt we have to sample - * the physical line and set the line level, because the - * device state could have changed or we simply need to - * process the still pending interrupt later. - * - * If this causes us to lower the level, we have to also clear - * the physical active state, since we will otherwise never be - * told when the interrupt becomes asserted again. - * - * Another case is when the interrupt requires a helping hand - * on deactivation (no HW deactivation, for example). - */ - if (vgic_irq_is_mapped_level(irq)) { - bool resample = false; - - if (val & ICH_LR_PENDING_BIT) { - irq->line_level = vgic_get_phys_line_level(irq); - resample = !irq->line_level; - } else if (vgic_irq_needs_resampling(irq) && - !(irq->active || irq->pending_latch)) { - resample = true; - } - - if (resample) - vgic_irq_set_phys_active(irq, false); - } + /* Handle resampling for mapped interrupts if required */ + vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT); raw_spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 81cec508d413..5dad4996cfb2 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -1021,3 +1021,41 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid) return map_is_active; } + +/* + * Level-triggered mapped IRQs are special because we only observe rising + * edges as input to the VGIC. + * + * If the guest never acked the interrupt we have to sample the physical + * line and set the line level, because the device state could have changed + * or we simply need to process the still pending interrupt later. + * + * We could also have entered the guest with the interrupt active+pending. + * On the next exit, we need to re-evaluate the pending state, as it could + * otherwise result in a spurious interrupt by injecting a now potentially + * stale pending state. + * + * If this causes us to lower the level, we have to also clear the physical + * active state, since we will otherwise never be told when the interrupt + * becomes asserted again. + * + * Another case is when the interrupt requires a helping hand on + * deactivation (no HW deactivation, for example). + */ +void vgic_irq_handle_resampling(struct vgic_irq *irq, + bool lr_deactivated, bool lr_pending) +{ + if (vgic_irq_is_mapped_level(irq)) { + bool resample = false; + + if (unlikely(vgic_irq_needs_resampling(irq))) { + resample = !(irq->active || irq->pending_latch); + } else if (lr_pending || (lr_deactivated && irq->line_level)) { + irq->line_level = vgic_get_phys_line_level(irq); + resample = !irq->line_level; + } + + if (resample) + vgic_irq_set_phys_active(irq, false); + } +} diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index dc1f3d1657ee..14a9218641f5 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -169,6 +169,8 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, unsigned long flags); void vgic_kick_vcpus(struct kvm *kvm); +void vgic_irq_handle_resampling(struct vgic_irq *irq, + bool lr_deactivated, bool lr_pending); int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, phys_addr_t addr, phys_addr_t alignment); -- cgit v1.2.3 From 923a547d71b967c808a596968cf8022102f8b5b2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 6 Aug 2021 12:31:06 +0100 Subject: KVM: arm64: Move kern_hyp_va() usage in __load_guest_stage2() into the callers It is a bit awkward to use kern_hyp_va() in __load_guest_stage2(), specially as the helper is shared between VHE and nVHE. Instead, move the use of kern_hyp_va() in the nVHE code, and pass a pointer to the kvm->arch structure instead. Although this may look a bit awkward, it allows for some further simplification. Cc: Catalin Marinas Cc: Jade Alglave Cc: Shameer Kolothum Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20210806113109.2475-4-will@kernel.org --- arch/arm64/include/asm/kvm_mmu.h | 5 +++-- arch/arm64/kvm/hyp/nvhe/switch.c | 4 +++- arch/arm64/kvm/hyp/nvhe/tlb.c | 2 +- arch/arm64/kvm/hyp/vhe/switch.c | 2 +- arch/arm64/kvm/hyp/vhe/tlb.c | 2 +- 5 files changed, 9 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index b52c5c4b9a3d..05e089653a1a 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -280,9 +280,10 @@ static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); } -static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu) +static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu, + struct kvm_arch *arch) { - __load_stage2(mmu, kern_hyp_va(mmu->arch)->vtcr); + __load_stage2(mmu, arch->vtcr); } static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu) diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index f7af9688c1f7..e50a49082923 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -170,6 +170,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *host_ctxt; struct kvm_cpu_context *guest_ctxt; + struct kvm_s2_mmu *mmu; bool pmu_switch_needed; u64 exit_code; @@ -213,7 +214,8 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __sysreg32_restore_state(vcpu); __sysreg_restore_state_nvhe(guest_ctxt); - __load_guest_stage2(kern_hyp_va(vcpu->arch.hw_mmu)); + mmu = kern_hyp_va(vcpu->arch.hw_mmu); + __load_guest_stage2(mmu, kern_hyp_va(mmu->arch)); __activate_traps(vcpu); __hyp_vgic_restore_state(vcpu); diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 38ed0f6f2703..76229407d8f0 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -39,7 +39,7 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, * ensuring that we always have an ISB, but not two ISBs back * to back. */ - __load_guest_stage2(mmu); + __load_guest_stage2(mmu, kern_hyp_va(mmu->arch)); asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT)); } diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index b3229924d243..0cb7523a501a 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -128,7 +128,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) * __load_guest_stage2 configures stage 2 translation, and * __activate_traps clear HCR_EL2.TGE (among other things). */ - __load_guest_stage2(vcpu->arch.hw_mmu); + __load_guest_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch); __activate_traps(vcpu); __kvm_adjust_pc(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index 66f17349f0c3..5e9fb3989e0b 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -53,7 +53,7 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, * place before clearing TGE. __load_guest_stage2() already * has an ISB in order to deal with this. */ - __load_guest_stage2(mmu); + __load_guest_stage2(mmu, mmu->arch); val = read_sysreg(hcr_el2); val &= ~HCR_TGE; write_sysreg(val, hcr_el2); -- cgit v1.2.3 From 4efc0ede4f31d7ec25c3dee0c8f07f93735cee6d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 6 Aug 2021 12:31:07 +0100 Subject: KVM: arm64: Unify stage-2 programming behind __load_stage2() The protected mode relies on a separate helper to load the S2 context. Move over to the __load_guest_stage2() helper instead, and rename it to __load_stage2() to present a unified interface. Cc: Catalin Marinas Cc: Jade Alglave Cc: Shameer Kolothum Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20210806113109.2475-5-will@kernel.org --- arch/arm64/include/asm/kvm_mmu.h | 11 +++-------- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 +- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 2 +- arch/arm64/kvm/hyp/nvhe/switch.c | 2 +- arch/arm64/kvm/hyp/nvhe/tlb.c | 4 ++-- arch/arm64/kvm/hyp/vhe/switch.c | 6 +++--- arch/arm64/kvm/hyp/vhe/tlb.c | 4 ++-- 7 files changed, 13 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 05e089653a1a..08bc81f6944b 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -267,9 +267,10 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) * Must be called from hyp code running at EL2 with an updated VTTBR * and interrupts disabled. */ -static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long vtcr) +static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, + struct kvm_arch *arch) { - write_sysreg(vtcr, vtcr_el2); + write_sysreg(arch->vtcr, vtcr_el2); write_sysreg(kvm_get_vttbr(mmu), vttbr_el2); /* @@ -280,12 +281,6 @@ static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); } -static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu, - struct kvm_arch *arch) -{ - __load_stage2(mmu, arch->vtcr); -} - static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu) { return container_of(mmu->arch, struct kvm, arch); diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 9c227d87c36d..8901dc95d7de 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -29,7 +29,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); static __always_inline void __load_host_stage2(void) { if (static_branch_likely(&kvm_protected_mode_initialized)) - __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr); + __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch); else write_sysreg(0, vttbr_el2); } diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index d938ce95d3bd..36aea13c9e5a 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -126,7 +126,7 @@ int __pkvm_prot_finalize(void) kvm_flush_dcache_to_poc(params, sizeof(*params)); write_sysreg(params->hcr_el2, hcr_el2); - __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr); + __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch); /* * Make sure to have an ISB before the TLB maintenance below but only diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index e50a49082923..3e7ad32b3f0d 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -215,7 +215,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __sysreg_restore_state_nvhe(guest_ctxt); mmu = kern_hyp_va(vcpu->arch.hw_mmu); - __load_guest_stage2(mmu, kern_hyp_va(mmu->arch)); + __load_stage2(mmu, kern_hyp_va(mmu->arch)); __activate_traps(vcpu); __hyp_vgic_restore_state(vcpu); diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 76229407d8f0..d296d617f589 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -34,12 +34,12 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, } /* - * __load_guest_stage2() includes an ISB only when the AT + * __load_stage2() includes an ISB only when the AT * workaround is applied. Take care of the opposite condition, * ensuring that we always have an ISB, but not two ISBs back * to back. */ - __load_guest_stage2(mmu, kern_hyp_va(mmu->arch)); + __load_stage2(mmu, kern_hyp_va(mmu->arch)); asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT)); } diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 0cb7523a501a..709f6438283e 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -124,11 +124,11 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) * * We have already configured the guest's stage 1 translation in * kvm_vcpu_load_sysregs_vhe above. We must now call - * __load_guest_stage2 before __activate_traps, because - * __load_guest_stage2 configures stage 2 translation, and + * __load_stage2 before __activate_traps, because + * __load_stage2 configures stage 2 translation, and * __activate_traps clear HCR_EL2.TGE (among other things). */ - __load_guest_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch); + __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch); __activate_traps(vcpu); __kvm_adjust_pc(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index 5e9fb3989e0b..24cef9b87f9e 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -50,10 +50,10 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, * * ARM erratum 1165522 requires some special handling (again), * as we need to make sure both stages of translation are in - * place before clearing TGE. __load_guest_stage2() already + * place before clearing TGE. __load_stage2() already * has an ISB in order to deal with this. */ - __load_guest_stage2(mmu, mmu->arch); + __load_stage2(mmu, mmu->arch); val = read_sysreg(hcr_el2); val &= ~HCR_TGE; write_sysreg(val, hcr_el2); -- cgit v1.2.3 From cf364e08ea1c5dd217afb658d510aaef7d0cc6f4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 6 Aug 2021 12:31:08 +0100 Subject: KVM: arm64: Upgrade VMID accesses to {READ,WRITE}_ONCE Since TLB invalidation can run in parallel with VMID allocation, we need to be careful and avoid any sort of load/store tearing. Use {READ,WRITE}_ONCE consistently to avoid any surprise. Cc: Catalin Marinas Cc: Jade Alglave Cc: Shameer Kolothum Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Reviewed-by: Quentin Perret Link: https://lore.kernel.org/r/20210806113109.2475-6-will@kernel.org --- arch/arm64/include/asm/kvm_mmu.h | 7 ++++++- arch/arm64/kvm/arm.c | 2 +- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 4 ++-- arch/arm64/kvm/mmu.c | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 08bc81f6944b..02d378887743 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -252,6 +252,11 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) +/* + * When this is (directly or indirectly) used on the TLB invalidation + * path, we rely on a previously issued DSB so that page table updates + * and VMID reads are correctly ordered. + */ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) { struct kvm_vmid *vmid = &mmu->vmid; @@ -259,7 +264,7 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) u64 cnp = system_supports_cnp() ? VTTBR_CNP_BIT : 0; baddr = mmu->pgd_phys; - vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT; + vmid_field = (u64)READ_ONCE(vmid->vmid) << VTTBR_VMID_SHIFT; return kvm_phys_to_vttbr(baddr) | vmid_field | cnp; } diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e9a2b8f27792..658f76067f46 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -571,7 +571,7 @@ static void update_vmid(struct kvm_vmid *vmid) kvm_call_hyp(__kvm_flush_vm_context); } - vmid->vmid = kvm_next_vmid; + WRITE_ONCE(vmid->vmid, kvm_next_vmid); kvm_next_vmid++; kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1; diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 36aea13c9e5a..7a0c4af186a0 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -109,8 +109,8 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd); mmu->arch = &host_kvm.arch; mmu->pgt = &host_kvm.pgt; - mmu->vmid.vmid_gen = 0; - mmu->vmid.vmid = 0; + WRITE_ONCE(mmu->vmid.vmid_gen, 0); + WRITE_ONCE(mmu->vmid.vmid, 0); return 0; } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0625bf2353c2..d9152717cbd9 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -485,7 +485,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) mmu->arch = &kvm->arch; mmu->pgt = pgt; mmu->pgd_phys = __pa(pgt->pgd); - mmu->vmid.vmid_gen = 0; + WRITE_ONCE(mmu->vmid.vmid_gen, 0); return 0; out_destroy_pgtable: -- cgit v1.2.3 From 2ea7f655800b00b109951f22539fe2025add210b Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 17 Aug 2021 09:11:20 +0100 Subject: KVM: arm64: placeholder to check if VM is protected Add a function to check whether a VM is protected (under pKVM). Since the creation of protected VMs isn't enabled yet, this is a placeholder that always returns false. The intention is for this to become a check for protected VMs in the future (see Will's RFC). No functional change intended. Acked-by: Will Deacon Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/kvmarm/20210603183347.1695-1-will@kernel.org/ Link: https://lore.kernel.org/r/20210817081134.2918285-2-tabba@google.com --- arch/arm64/include/asm/kvm_host.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 41911585ae0c..347781f99b6a 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -771,6 +771,11 @@ void kvm_arch_free_vm(struct kvm *kvm); int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); +static inline bool kvm_vm_is_protected(struct kvm *kvm) +{ + return false; +} + int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature); bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From e6bc555c96990046d680ff92c8e2e7b6b43b509f Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 17 Aug 2021 09:11:21 +0100 Subject: KVM: arm64: Remove trailing whitespace in comment Remove trailing whitespace from comment in trap_dbgauthstatus_el1(). No functional change intended. Acked-by: Will Deacon Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210817081134.2918285-3-tabba@google.com --- arch/arm64/kvm/sys_regs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index f6f126eb6ac1..80a6e41cadad 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -318,14 +318,14 @@ static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu, /* * We want to avoid world-switching all the DBG registers all the * time: - * + * * - If we've touched any debug register, it is likely that we're * going to touch more of them. It then makes sense to disable the * traps and start doing the save/restore dance * - If debug is active (DBG_MDSCR_KDE or DBG_MDSCR_MDE set), it is * then mandatory to save/restore the registers, as the guest * depends on them. - * + * * For this, we use a DIRTY bit, indicating the guest has modified the * debug registers, used as follow: * -- cgit v1.2.3 From d6c850dd6ce9ce4b410142a600d8c34dc041d860 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 17 Aug 2021 09:11:22 +0100 Subject: KVM: arm64: MDCR_EL2 is a 64-bit register Fix the places in KVM that treat MDCR_EL2 as a 32-bit register. More recent features (e.g., FEAT_SPEv1p2) use bits above 31. No functional change intended. Acked-by: Will Deacon Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210817081134.2918285-4-tabba@google.com --- arch/arm64/include/asm/kvm_arm.h | 20 ++++++++++---------- arch/arm64/include/asm/kvm_asm.h | 2 +- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/debug.c | 2 +- arch/arm64/kvm/hyp/nvhe/debug-sr.c | 2 +- arch/arm64/kvm/hyp/vhe/debug-sr.c | 2 +- 6 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index d436831dd706..6a523ec83415 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -281,18 +281,18 @@ /* Hyp Debug Configuration Register bits */ #define MDCR_EL2_E2TB_MASK (UL(0x3)) #define MDCR_EL2_E2TB_SHIFT (UL(24)) -#define MDCR_EL2_TTRF (1 << 19) -#define MDCR_EL2_TPMS (1 << 14) +#define MDCR_EL2_TTRF (UL(1) << 19) +#define MDCR_EL2_TPMS (UL(1) << 14) #define MDCR_EL2_E2PB_MASK (UL(0x3)) #define MDCR_EL2_E2PB_SHIFT (UL(12)) -#define MDCR_EL2_TDRA (1 << 11) -#define MDCR_EL2_TDOSA (1 << 10) -#define MDCR_EL2_TDA (1 << 9) -#define MDCR_EL2_TDE (1 << 8) -#define MDCR_EL2_HPME (1 << 7) -#define MDCR_EL2_TPM (1 << 6) -#define MDCR_EL2_TPMCR (1 << 5) -#define MDCR_EL2_HPMN_MASK (0x1F) +#define MDCR_EL2_TDRA (UL(1) << 11) +#define MDCR_EL2_TDOSA (UL(1) << 10) +#define MDCR_EL2_TDA (UL(1) << 9) +#define MDCR_EL2_TDE (UL(1) << 8) +#define MDCR_EL2_HPME (UL(1) << 7) +#define MDCR_EL2_TPM (UL(1) << 6) +#define MDCR_EL2_TPMCR (UL(1) << 5) +#define MDCR_EL2_HPMN_MASK (UL(0x1F)) /* For compatibility with fault code shared with 32-bit */ #define FSC_FAULT ESR_ELx_FSC_FAULT diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 9f0bf2109be7..63ead9060ab5 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -210,7 +210,7 @@ extern u64 __vgic_v3_read_vmcr(void); extern void __vgic_v3_write_vmcr(u32 vmcr); extern void __vgic_v3_init_lrs(void); -extern u32 __kvm_get_mdcr_el2(void); +extern u64 __kvm_get_mdcr_el2(void); #define __KVM_EXTABLE(from, to) \ " .pushsection __kvm_ex_table, \"a\"\n" \ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 347781f99b6a..4d2d974c1522 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -289,7 +289,7 @@ struct kvm_vcpu_arch { /* HYP configuration */ u64 hcr_el2; - u32 mdcr_el2; + u64 mdcr_el2; /* Exception Information */ struct kvm_vcpu_fault_info fault; diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index d5e79d7ee6e9..db9361338b2a 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -21,7 +21,7 @@ DBG_MDSCR_KDE | \ DBG_MDSCR_MDE) -static DEFINE_PER_CPU(u32, mdcr_el2); +static DEFINE_PER_CPU(u64, mdcr_el2); /** * save/restore_guest_debug_regs diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 7d3f25868cae..df361d839902 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -109,7 +109,7 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu) __debug_switch_to_host_common(vcpu); } -u32 __kvm_get_mdcr_el2(void) +u64 __kvm_get_mdcr_el2(void) { return read_sysreg(mdcr_el2); } diff --git a/arch/arm64/kvm/hyp/vhe/debug-sr.c b/arch/arm64/kvm/hyp/vhe/debug-sr.c index f1e2e5a00933..289689b2682d 100644 --- a/arch/arm64/kvm/hyp/vhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/vhe/debug-sr.c @@ -20,7 +20,7 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu) __debug_switch_to_host_common(vcpu); } -u32 __kvm_get_mdcr_el2(void) +u64 __kvm_get_mdcr_el2(void) { return read_sysreg(mdcr_el2); } -- cgit v1.2.3 From dabb1667d8573302712a75530cccfee8f3ffff84 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 17 Aug 2021 09:11:23 +0100 Subject: KVM: arm64: Fix names of config register fields Change the names of hcr_el2 register fields to match the Arm Architecture Reference Manual. Easier for cross-referencing and for grepping. Also, change the name of CPTR_EL2_RES1 to CPTR_NVHE_EL2_RES1, because res1 bits are different for VHE. No functional change intended. Acked-by: Will Deacon Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210817081134.2918285-5-tabba@google.com --- arch/arm64/include/asm/kvm_arm.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 6a523ec83415..a928b2dc0b0f 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -32,9 +32,9 @@ #define HCR_TVM (UL(1) << 26) #define HCR_TTLB (UL(1) << 25) #define HCR_TPU (UL(1) << 24) -#define HCR_TPC (UL(1) << 23) +#define HCR_TPC (UL(1) << 23) /* HCR_TPCP if FEAT_DPB */ #define HCR_TSW (UL(1) << 22) -#define HCR_TAC (UL(1) << 21) +#define HCR_TACR (UL(1) << 21) #define HCR_TIDCP (UL(1) << 20) #define HCR_TSC (UL(1) << 19) #define HCR_TID3 (UL(1) << 18) @@ -61,7 +61,7 @@ * The bits we set in HCR: * TLOR: Trap LORegion register accesses * RW: 64bit by default, can be overridden for 32bit VMs - * TAC: Trap ACTLR + * TACR: Trap ACTLR * TSC: Trap SMC * TSW: Trap cache operations by set/way * TWE: Trap WFE @@ -76,7 +76,7 @@ * PTW: Take a stage2 fault if a stage1 walk steps in device memory */ #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \ - HCR_BSU_IS | HCR_FB | HCR_TAC | \ + HCR_BSU_IS | HCR_FB | HCR_TACR | \ HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \ HCR_FMO | HCR_IMO | HCR_PTW ) #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF) @@ -275,8 +275,8 @@ #define CPTR_EL2_TTA (1 << 20) #define CPTR_EL2_TFP (1 << CPTR_EL2_TFP_SHIFT) #define CPTR_EL2_TZ (1 << 8) -#define CPTR_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 */ -#define CPTR_EL2_DEFAULT CPTR_EL2_RES1 +#define CPTR_NVHE_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 (nVHE) */ +#define CPTR_EL2_DEFAULT CPTR_NVHE_EL2_RES1 /* Hyp Debug Configuration Register bits */ #define MDCR_EL2_E2TB_MASK (UL(0x3)) -- cgit v1.2.3 From f76f89e2f73d93720cfcad7fb7b24d022b2846bf Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 17 Aug 2021 09:11:24 +0100 Subject: KVM: arm64: Refactor sys_regs.h,c for nVHE reuse Refactor sys_regs.h and sys_regs.c to make it easier to reuse common code. It will be used in nVHE in a later patch. Note that the refactored code uses __inline_bsearch for find_reg instead of bsearch to avoid copying the bsearch code for nVHE. No functional change intended. Acked-by: Will Deacon Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210817081134.2918285-6-tabba@google.com --- arch/arm64/include/asm/sysreg.h | 5 ++++ arch/arm64/kvm/sys_regs.c | 60 +++++++++++------------------------------ arch/arm64/kvm/sys_regs.h | 31 +++++++++++++++++++++ 3 files changed, 52 insertions(+), 44 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 7b9c3acba684..53a93a9c5253 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1153,6 +1153,11 @@ #define ICH_VTR_A3V_SHIFT 21 #define ICH_VTR_A3V_MASK (1 << ICH_VTR_A3V_SHIFT) +#define ARM64_FEATURE_FIELD_BITS 4 + +/* Create a mask for the feature bits of the specified feature. */ +#define ARM64_FEATURE_MASK(x) (GENMASK_ULL(x##_SHIFT + ARM64_FEATURE_FIELD_BITS - 1, x##_SHIFT)) + #ifdef __ASSEMBLY__ .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 80a6e41cadad..b6a2f8e890db 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -44,10 +44,6 @@ * 64bit interface. */ -#define reg_to_encoding(x) \ - sys_reg((u32)(x)->Op0, (u32)(x)->Op1, \ - (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2) - static bool read_from_write_only(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r) @@ -1026,8 +1022,6 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } -#define FEATURE(x) (GENMASK_ULL(x##_SHIFT + 3, x##_SHIFT)) - /* Read a sanitised cpufeature ID register by sys_reg_desc */ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r, bool raz) @@ -1038,40 +1032,40 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, switch (id) { case SYS_ID_AA64PFR0_EL1: if (!vcpu_has_sve(vcpu)) - val &= ~FEATURE(ID_AA64PFR0_SVE); - val &= ~FEATURE(ID_AA64PFR0_AMU); - val &= ~FEATURE(ID_AA64PFR0_CSV2); - val |= FIELD_PREP(FEATURE(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); - val &= ~FEATURE(ID_AA64PFR0_CSV3); - val |= FIELD_PREP(FEATURE(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_SVE); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_AMU); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); break; case SYS_ID_AA64PFR1_EL1: - val &= ~FEATURE(ID_AA64PFR1_MTE); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_MTE); if (kvm_has_mte(vcpu->kvm)) { u64 pfr, mte; pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); mte = cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR1_MTE_SHIFT); - val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE), mte); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR1_MTE), mte); } break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) - val &= ~(FEATURE(ID_AA64ISAR1_APA) | - FEATURE(ID_AA64ISAR1_API) | - FEATURE(ID_AA64ISAR1_GPA) | - FEATURE(ID_AA64ISAR1_GPI)); + val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_APA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_API) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI)); break; case SYS_ID_AA64DFR0_EL1: /* Limit debug to ARMv8.0 */ - val &= ~FEATURE(ID_AA64DFR0_DEBUGVER); - val |= FIELD_PREP(FEATURE(ID_AA64DFR0_DEBUGVER), 6); + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER), 6); /* Limit guests to PMUv3 for ARMv8.4 */ val = cpuid_feature_cap_perfmon_field(val, ID_AA64DFR0_PMUVER_SHIFT, kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_PMUVER_8_4 : 0); /* Hide SPE from guests */ - val &= ~FEATURE(ID_AA64DFR0_PMSVER); + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_PMSVER); break; case SYS_ID_DFR0_EL1: /* Limit guests to PMUv3 for ARMv8.4 */ @@ -2106,23 +2100,6 @@ static int check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, return 0; } -static int match_sys_reg(const void *key, const void *elt) -{ - const unsigned long pval = (unsigned long)key; - const struct sys_reg_desc *r = elt; - - return pval - reg_to_encoding(r); -} - -static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params, - const struct sys_reg_desc table[], - unsigned int num) -{ - unsigned long pval = reg_to_encoding(params); - - return bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); -} - int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu) { kvm_inject_undefined(vcpu); @@ -2365,13 +2342,8 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu) trace_kvm_handle_sys_reg(esr); - params.Op0 = (esr >> 20) & 3; - params.Op1 = (esr >> 14) & 0x7; - params.CRn = (esr >> 10) & 0xf; - params.CRm = (esr >> 1) & 0xf; - params.Op2 = (esr >> 17) & 0x7; + params = esr_sys64_to_params(esr); params.regval = vcpu_get_reg(vcpu, Rt); - params.is_write = !(esr & 1); ret = emulate_sys_reg(vcpu, ¶ms); diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 9d0621417c2a..cc0cc95a0280 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -11,6 +11,12 @@ #ifndef __ARM64_KVM_SYS_REGS_LOCAL_H__ #define __ARM64_KVM_SYS_REGS_LOCAL_H__ +#include + +#define reg_to_encoding(x) \ + sys_reg((u32)(x)->Op0, (u32)(x)->Op1, \ + (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2) + struct sys_reg_params { u8 Op0; u8 Op1; @@ -21,6 +27,14 @@ struct sys_reg_params { bool is_write; }; +#define esr_sys64_to_params(esr) \ + ((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3, \ + .Op1 = ((esr) >> 14) & 0x7, \ + .CRn = ((esr) >> 10) & 0xf, \ + .CRm = ((esr) >> 1) & 0xf, \ + .Op2 = ((esr) >> 17) & 0x7, \ + .is_write = !((esr) & 1) }) + struct sys_reg_desc { /* Sysreg string for debug */ const char *name; @@ -152,6 +166,23 @@ static inline int cmp_sys_reg(const struct sys_reg_desc *i1, return i1->Op2 - i2->Op2; } +static inline int match_sys_reg(const void *key, const void *elt) +{ + const unsigned long pval = (unsigned long)key; + const struct sys_reg_desc *r = elt; + + return pval - reg_to_encoding(r); +} + +static inline const struct sys_reg_desc * +find_reg(const struct sys_reg_params *params, const struct sys_reg_desc table[], + unsigned int num) +{ + unsigned long pval = reg_to_encoding(params); + + return __inline_bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); +} + const struct sys_reg_desc *find_reg_by_id(u64 id, struct sys_reg_params *params, const struct sys_reg_desc table[], -- cgit v1.2.3 From 1460b4b25fde52cbee746c11a4b1d3185f2e2847 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 17 Aug 2021 09:11:25 +0100 Subject: KVM: arm64: Restore mdcr_el2 from vcpu On deactivating traps, restore the value of mdcr_el2 from the newly created and preserved host value vcpu context, rather than directly reading the hardware register. Up until and including this patch the two values are the same, i.e., the hardware register and the vcpu one. A future patch will be changing the value of mdcr_el2 on activating traps, and this ensures that its value will be restored. No functional change intended. Signed-off-by: Fuad Tabba Acked-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210817081134.2918285-7-tabba@google.com --- arch/arm64/include/asm/kvm_host.h | 5 ++++- arch/arm64/include/asm/kvm_hyp.h | 2 +- arch/arm64/kvm/hyp/include/hyp/switch.h | 6 +++++- arch/arm64/kvm/hyp/nvhe/switch.c | 13 +++++-------- arch/arm64/kvm/hyp/vhe/switch.c | 14 +++++--------- arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 2 +- 6 files changed, 21 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 4d2d974c1522..76462c6a91ee 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -287,10 +287,13 @@ struct kvm_vcpu_arch { /* Stage 2 paging state used by the hardware on next switch */ struct kvm_s2_mmu *hw_mmu; - /* HYP configuration */ + /* Values of trap registers for the guest. */ u64 hcr_el2; u64 mdcr_el2; + /* Values of trap registers for the host before guest entry. */ + u64 mdcr_el2_host; + /* Exception Information */ struct kvm_vcpu_fault_info fault; diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 9d60b3006efc..657d0c94cf82 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -95,7 +95,7 @@ void __sve_restore_state(void *sve_pffr, u32 *fpsr); #ifndef __KVM_NVHE_HYPERVISOR__ void activate_traps_vhe_load(struct kvm_vcpu *vcpu); -void deactivate_traps_vhe_put(void); +void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu); #endif u64 __guest_enter(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index e4a2f295a394..a0e78a6027be 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -92,11 +92,15 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) write_sysreg(0, pmselr_el0); write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); } + + vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2); write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); } -static inline void __deactivate_traps_common(void) +static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) { + write_sysreg(vcpu->arch.mdcr_el2_host, mdcr_el2); + write_sysreg(0, hstr_el2); if (kvm_arm_support_pmu_v3()) write_sysreg(0, pmuserenr_el0); diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index f7af9688c1f7..2ea764a48958 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -69,12 +69,10 @@ static void __activate_traps(struct kvm_vcpu *vcpu) static void __deactivate_traps(struct kvm_vcpu *vcpu) { extern char __kvm_hyp_host_vector[]; - u64 mdcr_el2, cptr; + u64 cptr; ___deactivate_traps(vcpu); - mdcr_el2 = read_sysreg(mdcr_el2); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { u64 val; @@ -92,13 +90,12 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) isb(); } - __deactivate_traps_common(); + vcpu->arch.mdcr_el2_host &= MDCR_EL2_HPMN_MASK | + MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT | + MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT; - mdcr_el2 &= MDCR_EL2_HPMN_MASK; - mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; - mdcr_el2 |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT; + __deactivate_traps_common(vcpu); - write_sysreg(mdcr_el2, mdcr_el2); write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); cptr = CPTR_EL2_DEFAULT; diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index b3229924d243..ec158fa41ae6 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -91,17 +91,13 @@ void activate_traps_vhe_load(struct kvm_vcpu *vcpu) __activate_traps_common(vcpu); } -void deactivate_traps_vhe_put(void) +void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu) { - u64 mdcr_el2 = read_sysreg(mdcr_el2); + vcpu->arch.mdcr_el2_host &= MDCR_EL2_HPMN_MASK | + MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT | + MDCR_EL2_TPMS; - mdcr_el2 &= MDCR_EL2_HPMN_MASK | - MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT | - MDCR_EL2_TPMS; - - write_sysreg(mdcr_el2, mdcr_el2); - - __deactivate_traps_common(); + __deactivate_traps_common(vcpu); } /* Switch to the guest for VHE systems running in EL2 */ diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 2a0b8c88d74f..007a12dd4351 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -101,7 +101,7 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu) struct kvm_cpu_context *host_ctxt; host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; - deactivate_traps_vhe_put(); + deactivate_traps_vhe_put(vcpu); __sysreg_save_el1_state(guest_ctxt); __sysreg_save_user_state(guest_ctxt); -- cgit v1.2.3 From 12849badc6d2456f15f8f2c93037628d5176810b Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 17 Aug 2021 09:11:26 +0100 Subject: KVM: arm64: Keep mdcr_el2's value as set by __init_el2_debug __init_el2_debug configures mdcr_el2 at initialization based on, among other things, available hardware support. Trap deactivation doesn't check that, so keep the initial value. No functional change intended. Signed-off-by: Fuad Tabba Acked-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210817081134.2918285-8-tabba@google.com --- arch/arm64/kvm/hyp/nvhe/switch.c | 4 ---- arch/arm64/kvm/hyp/vhe/switch.c | 4 ---- 2 files changed, 8 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 2ea764a48958..1778593a08a9 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -90,10 +90,6 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) isb(); } - vcpu->arch.mdcr_el2_host &= MDCR_EL2_HPMN_MASK | - MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT | - MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT; - __deactivate_traps_common(vcpu); write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index ec158fa41ae6..0d0c9550fb08 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -93,10 +93,6 @@ void activate_traps_vhe_load(struct kvm_vcpu *vcpu) void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu) { - vcpu->arch.mdcr_el2_host &= MDCR_EL2_HPMN_MASK | - MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT | - MDCR_EL2_TPMS; - __deactivate_traps_common(vcpu); } -- cgit v1.2.3 From cd496228fd8de2e82b6636d3d89105631ea2b69c Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 17 Aug 2021 09:11:27 +0100 Subject: KVM: arm64: Track value of cptr_el2 in struct kvm_vcpu_arch Track the baseline guest value for cptr_el2 in struct kvm_vcpu_arch, similar to the other registers that control traps. Use this value when setting cptr_el2 for the guest. Currently this value is unchanged (CPTR_EL2_DEFAULT), but future patches will set trapping bits based on features supported for the guest. No functional change intended. Acked-by: Will Deacon Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210817081134.2918285-9-tabba@google.com --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/hyp/nvhe/switch.c | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 76462c6a91ee..ac67d5699c68 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -290,6 +290,7 @@ struct kvm_vcpu_arch { /* Values of trap registers for the guest. */ u64 hcr_el2; u64 mdcr_el2; + u64 cptr_el2; /* Values of trap registers for the host before guest entry. */ u64 mdcr_el2_host; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e9a2b8f27792..14b12f2c08c0 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1104,6 +1104,7 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, } vcpu_reset_hcr(vcpu); + vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT; /* * Handle the "start in power-off" case. diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 1778593a08a9..86f3d6482935 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -41,7 +41,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) ___activate_traps(vcpu); __activate_traps_common(vcpu); - val = CPTR_EL2_DEFAULT; + val = vcpu->arch.cptr_el2; val |= CPTR_EL2_TTA | CPTR_EL2_TAM; if (!update_fp_enabled(vcpu)) { val |= CPTR_EL2_TFP | CPTR_EL2_TZ; -- cgit v1.2.3 From 95b54c3e4c92b9185b15c83e8baab9ba312195f6 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 17 Aug 2021 09:11:28 +0100 Subject: KVM: arm64: Add feature register flag definitions Add feature register flag definitions to clarify which features might be supported. Consolidate the various ID_AA64PFR0_ELx flags for all ELs. No functional change intended. Signed-off-by: Fuad Tabba Acked-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210817081134.2918285-10-tabba@google.com --- arch/arm64/include/asm/cpufeature.h | 4 ++-- arch/arm64/include/asm/sysreg.h | 12 ++++++++---- arch/arm64/kernel/cpufeature.c | 8 ++++---- 3 files changed, 14 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 9bb9d11750d7..b7d9bb17908d 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -602,14 +602,14 @@ static inline bool id_aa64pfr0_32bit_el1(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SHIFT); - return val == ID_AA64PFR0_EL1_32BIT_64BIT; + return val == ID_AA64PFR0_ELx_32BIT_64BIT; } static inline bool id_aa64pfr0_32bit_el0(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL0_SHIFT); - return val == ID_AA64PFR0_EL0_32BIT_64BIT; + return val == ID_AA64PFR0_ELx_32BIT_64BIT; } static inline bool id_aa64pfr0_sve(u64 pfr0) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 53a93a9c5253..f84a00f5874d 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -784,14 +784,13 @@ #define ID_AA64PFR0_AMU 0x1 #define ID_AA64PFR0_SVE 0x1 #define ID_AA64PFR0_RAS_V1 0x1 +#define ID_AA64PFR0_RAS_V1P1 0x2 #define ID_AA64PFR0_FP_NI 0xf #define ID_AA64PFR0_FP_SUPPORTED 0x0 #define ID_AA64PFR0_ASIMD_NI 0xf #define ID_AA64PFR0_ASIMD_SUPPORTED 0x0 -#define ID_AA64PFR0_EL1_64BIT_ONLY 0x1 -#define ID_AA64PFR0_EL1_32BIT_64BIT 0x2 -#define ID_AA64PFR0_EL0_64BIT_ONLY 0x1 -#define ID_AA64PFR0_EL0_32BIT_64BIT 0x2 +#define ID_AA64PFR0_ELx_64BIT_ONLY 0x1 +#define ID_AA64PFR0_ELx_32BIT_64BIT 0x2 /* id_aa64pfr1 */ #define ID_AA64PFR1_MPAMFRAC_SHIFT 16 @@ -847,12 +846,16 @@ #define ID_AA64MMFR0_ASID_SHIFT 4 #define ID_AA64MMFR0_PARANGE_SHIFT 0 +#define ID_AA64MMFR0_ASID_8 0x0 +#define ID_AA64MMFR0_ASID_16 0x2 + #define ID_AA64MMFR0_TGRAN4_NI 0xf #define ID_AA64MMFR0_TGRAN4_SUPPORTED 0x0 #define ID_AA64MMFR0_TGRAN64_NI 0xf #define ID_AA64MMFR0_TGRAN64_SUPPORTED 0x0 #define ID_AA64MMFR0_TGRAN16_NI 0x0 #define ID_AA64MMFR0_TGRAN16_SUPPORTED 0x1 +#define ID_AA64MMFR0_PARANGE_40 0x2 #define ID_AA64MMFR0_PARANGE_48 0x5 #define ID_AA64MMFR0_PARANGE_52 0x6 @@ -900,6 +903,7 @@ #define ID_AA64MMFR2_CNP_SHIFT 0 /* id_aa64dfr0 */ +#define ID_AA64DFR0_MTPMU_SHIFT 48 #define ID_AA64DFR0_TRBE_SHIFT 44 #define ID_AA64DFR0_TRACE_FILT_SHIFT 40 #define ID_AA64DFR0_DOUBLELOCK_SHIFT 36 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 0ead8bfedf20..5b59fe5e26e4 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -239,8 +239,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY), ARM64_FTR_END, }; @@ -1956,7 +1956,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, .field_pos = ID_AA64PFR0_EL0_SHIFT, - .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT, + .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT, }, #ifdef CONFIG_KVM { @@ -1967,7 +1967,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, .field_pos = ID_AA64PFR0_EL1_SHIFT, - .min_field_value = ID_AA64PFR0_EL1_32BIT_64BIT, + .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT, }, { .desc = "Protected KVM", -- cgit v1.2.3 From 2d701243b9f231b5d7f9a8cb81870650d3eb32bc Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 17 Aug 2021 09:11:29 +0100 Subject: KVM: arm64: Add config register bit definitions Add hardware configuration register bit definitions for HCR_EL2 and MDCR_EL2. Future patches toggle these hyp configuration register bits to trap on certain accesses. No functional change intended. Acked-by: Will Deacon Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210817081134.2918285-11-tabba@google.com --- arch/arm64/include/asm/kvm_arm.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index a928b2dc0b0f..327120c0089f 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -12,8 +12,13 @@ #include /* Hyp Configuration Register (HCR) bits */ + +#define HCR_TID5 (UL(1) << 58) +#define HCR_DCT (UL(1) << 57) #define HCR_ATA_SHIFT 56 #define HCR_ATA (UL(1) << HCR_ATA_SHIFT) +#define HCR_AMVOFFEN (UL(1) << 51) +#define HCR_FIEN (UL(1) << 47) #define HCR_FWB (UL(1) << 46) #define HCR_API (UL(1) << 41) #define HCR_APK (UL(1) << 40) @@ -56,6 +61,7 @@ #define HCR_PTW (UL(1) << 2) #define HCR_SWIO (UL(1) << 1) #define HCR_VM (UL(1) << 0) +#define HCR_RES0 ((UL(1) << 48) | (UL(1) << 39)) /* * The bits we set in HCR: @@ -277,11 +283,21 @@ #define CPTR_EL2_TZ (1 << 8) #define CPTR_NVHE_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 (nVHE) */ #define CPTR_EL2_DEFAULT CPTR_NVHE_EL2_RES1 +#define CPTR_NVHE_EL2_RES0 (GENMASK(63, 32) | \ + GENMASK(29, 21) | \ + GENMASK(19, 14) | \ + BIT(11)) /* Hyp Debug Configuration Register bits */ #define MDCR_EL2_E2TB_MASK (UL(0x3)) #define MDCR_EL2_E2TB_SHIFT (UL(24)) +#define MDCR_EL2_HPMFZS (UL(1) << 36) +#define MDCR_EL2_HPMFZO (UL(1) << 29) +#define MDCR_EL2_MTPME (UL(1) << 28) +#define MDCR_EL2_TDCC (UL(1) << 27) +#define MDCR_EL2_HCCD (UL(1) << 23) #define MDCR_EL2_TTRF (UL(1) << 19) +#define MDCR_EL2_HPMD (UL(1) << 17) #define MDCR_EL2_TPMS (UL(1) << 14) #define MDCR_EL2_E2PB_MASK (UL(0x3)) #define MDCR_EL2_E2PB_SHIFT (UL(12)) @@ -293,6 +309,12 @@ #define MDCR_EL2_TPM (UL(1) << 6) #define MDCR_EL2_TPMCR (UL(1) << 5) #define MDCR_EL2_HPMN_MASK (UL(0x1F)) +#define MDCR_EL2_RES0 (GENMASK(63, 37) | \ + GENMASK(35, 30) | \ + GENMASK(25, 24) | \ + GENMASK(22, 20) | \ + BIT(18) | \ + GENMASK(16, 15)) /* For compatibility with fault code shared with 32-bit */ #define FSC_FAULT ESR_ELx_FSC_FAULT -- cgit v1.2.3 From 411d63d8c64c2f3b0c497fe4658f13b3bca951e2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 20 Aug 2021 10:46:42 +0100 Subject: KVM: arm64: Upgrade trace_kvm_arm_set_dreg32() to 64bit A number of registers pased to trace_kvm_arm_set_dreg32() are actually 64bit. Upgrade the tracepoint to take a 64bit value, despite the name... Signed-off-by: Marc Zyngier --- arch/arm64/kvm/trace_handle_exit.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h index 8d78acc4fba7..064a58c19f48 100644 --- a/arch/arm64/kvm/trace_handle_exit.h +++ b/arch/arm64/kvm/trace_handle_exit.h @@ -78,13 +78,17 @@ TRACE_EVENT(kvm_arm_clear_debug, TP_printk("flags: 0x%08x", __entry->guest_debug) ); +/* + * The dreg32 name is a leftover from a distant past. This will really + * output a 64bit value... + */ TRACE_EVENT(kvm_arm_set_dreg32, - TP_PROTO(const char *name, __u32 value), + TP_PROTO(const char *name, __u64 value), TP_ARGS(name, value), TP_STRUCT__entry( __field(const char *, name) - __field(__u32, value) + __field(__u64, value) ), TP_fast_assign( @@ -92,7 +96,7 @@ TRACE_EVENT(kvm_arm_set_dreg32, __entry->value = value; ), - TP_printk("%s: 0x%08x", __entry->name, __entry->value) + TP_printk("%s: 0x%llx", __entry->name, __entry->value) ); TRACE_DEFINE_SIZEOF(__u64); -- cgit v1.2.3 From 14ecf075fe5be01860927fdf3aa11d7b18023ab2 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 28 Jul 2021 15:32:32 +0000 Subject: KVM: arm64: Minor optimization of range_is_memory Currently range_is_memory finds the corresponding struct memblock_region for both the lower and upper bounds of the given address range with two rounds of binary search, and then checks that the two memblocks are the same. Simplify this by only doing binary search on the lower bound and then checking that the upper bound is in the same memblock. Signed-off-by: David Brazdil Reviewed-by: Quentin Perret Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210728153232.1018911-3-dbrazdil@google.com --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index c11b50dd0050..5af2e28b9cd7 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -204,16 +204,19 @@ bool addr_is_memory(phys_addr_t phys) return find_mem_range(phys, &range); } +static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) +{ + return range->start <= addr && addr < range->end; +} + static bool range_is_memory(u64 start, u64 end) { - struct kvm_mem_range r1, r2; + struct kvm_mem_range r; - if (!find_mem_range(start, &r1) || !find_mem_range(end - 1, &r2)) - return false; - if (r1.start != r2.start) + if (!find_mem_range(start, &r)) return false; - return true; + return is_in_mem_range(end - 1, &r); } static inline int __host_stage2_idmap(u64 start, u64 end, -- cgit v1.2.3 From 4139b1972af281e0293c2414a0f1cd59fa5b2980 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 30 Jul 2021 18:04:51 -0400 Subject: KVM: X86: Introduce kvm_mmu_slot_lpages() helpers Introduce kvm_mmu_slot_lpages() to calculcate lpage_info and rmap array size. The other __kvm_mmu_slot_lpages() can take an extra parameter of npages rather than fetching from the memslot pointer. Start to use the latter one in kvm_alloc_memslot_metadata(). Signed-off-by: Peter Xu Message-Id: <20210730220455.26054-4-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 7 ------- arch/x86/kvm/mmu.h | 21 +++++++++++++++++++++ arch/x86/kvm/mmu/page_track.c | 1 + arch/x86/kvm/x86.c | 6 ++---- 4 files changed, 24 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a612d213be5c..1881f4c92de9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -124,13 +124,6 @@ #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) -static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) -{ - /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */ - return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - - (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); -} - #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64UL #define KVM_MMU_HASH_SHIFT 12 diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 83e6c6965f1e..59e831a8ab9d 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -240,4 +240,25 @@ static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) return smp_load_acquire(&kvm->arch.memslots_have_rmaps); } +static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) +{ + /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */ + return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); +} + +static inline unsigned long +__kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, unsigned long npages, + int level) +{ + return gfn_to_index(slot->base_gfn + npages - 1, + slot->base_gfn, level) + 1; +} + +static inline unsigned long +kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, int level) +{ + return __kvm_mmu_slot_lpages(slot, slot->npages, level); +} + #endif diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 91a9f7e0fd91..269f11f92fd0 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -16,6 +16,7 @@ #include +#include "mmu.h" #include "mmu_internal.h" void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 58a72c7d3330..4e97f7cd412e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11349,8 +11349,7 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { int level = i + 1; - int lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; + int lpages = __kvm_mmu_slot_lpages(slot, npages, level); WARN_ON(slot->arch.rmap[i]); @@ -11433,8 +11432,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, int lpages; int level = i + 1; - lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; + lpages = __kvm_mmu_slot_lpages(slot, npages, level); linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); if (!linfo) -- cgit v1.2.3 From 3bcd0662d66fd07e596d2a7445e6b3215631b901 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 30 Jul 2021 18:04:52 -0400 Subject: KVM: X86: Introduce mmu_rmaps_stat per-vm debugfs file Use this file to dump rmap statistic information. The statistic is done by calculating the rmap count and the result is log-2-based. An example output of this looks like (idle 6GB guest, right after boot linux): Rmap_Count: 0 1 2-3 4-7 8-15 16-31 32-63 64-127 128-255 256-511 512-1023 Level=4K: 3086676 53045 12330 1272 502 121 76 2 0 0 0 Level=2M: 5947 231 0 0 0 0 0 0 0 0 0 Level=1G: 32 0 0 0 0 0 0 0 0 0 0 Signed-off-by: Peter Xu Message-Id: <20210730220455.26054-5-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/debugfs.c | 111 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/mmu/mmu.c | 20 ++++++++ arch/x86/kvm/mmu/mmu_internal.h | 1 + 3 files changed, 132 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 95a98413dc32..54a83a744538 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -7,6 +7,8 @@ #include #include #include "lapic.h" +#include "mmu.h" +#include "mmu/mmu_internal.h" static int vcpu_get_timer_advance_ns(void *data, u64 *val) { @@ -73,3 +75,112 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_ &vcpu_tsc_scaling_frac_fops); } } + +/* + * This covers statistics <1024 (11=log(1024)+1), which should be enough to + * cover RMAP_RECYCLE_THRESHOLD. + */ +#define RMAP_LOG_SIZE 11 + +static const char *kvm_lpage_str[KVM_NR_PAGE_SIZES] = { "4K", "2M", "1G" }; + +static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v) +{ + struct kvm_rmap_head *rmap; + struct kvm *kvm = m->private; + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; + unsigned int lpage_size, index; + /* Still small enough to be on the stack */ + unsigned int *log[KVM_NR_PAGE_SIZES], *cur; + int i, j, k, l, ret; + + ret = -ENOMEM; + memset(log, 0, sizeof(log)); + for (i = 0; i < KVM_NR_PAGE_SIZES; i++) { + log[i] = kcalloc(RMAP_LOG_SIZE, sizeof(unsigned int), GFP_KERNEL); + if (!log[i]) + goto out; + } + + mutex_lock(&kvm->slots_lock); + write_lock(&kvm->mmu_lock); + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + for (j = 0; j < slots->used_slots; j++) { + slot = &slots->memslots[j]; + for (k = 0; k < KVM_NR_PAGE_SIZES; k++) { + rmap = slot->arch.rmap[k]; + lpage_size = kvm_mmu_slot_lpages(slot, k + 1); + cur = log[k]; + for (l = 0; l < lpage_size; l++) { + index = ffs(pte_list_count(&rmap[l])); + if (WARN_ON_ONCE(index >= RMAP_LOG_SIZE)) + index = RMAP_LOG_SIZE - 1; + cur[index]++; + } + } + } + } + + write_unlock(&kvm->mmu_lock); + mutex_unlock(&kvm->slots_lock); + + /* index=0 counts no rmap; index=1 counts 1 rmap */ + seq_printf(m, "Rmap_Count:\t0\t1\t"); + for (i = 2; i < RMAP_LOG_SIZE; i++) { + j = 1 << (i - 1); + k = (1 << i) - 1; + seq_printf(m, "%d-%d\t", j, k); + } + seq_printf(m, "\n"); + + for (i = 0; i < KVM_NR_PAGE_SIZES; i++) { + seq_printf(m, "Level=%s:\t", kvm_lpage_str[i]); + cur = log[i]; + for (j = 0; j < RMAP_LOG_SIZE; j++) + seq_printf(m, "%d\t", cur[j]); + seq_printf(m, "\n"); + } + + ret = 0; +out: + for (i = 0; i < KVM_NR_PAGE_SIZES; i++) + kfree(log[i]); + + return ret; +} + +static int kvm_mmu_rmaps_stat_open(struct inode *inode, struct file *file) +{ + struct kvm *kvm = inode->i_private; + + if (!kvm_get_kvm_safe(kvm)) + return -ENOENT; + + return single_open(file, kvm_mmu_rmaps_stat_show, kvm); +} + +static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file) +{ + struct kvm *kvm = inode->i_private; + + kvm_put_kvm(kvm); + + return single_release(inode, file); +} + +static const struct file_operations mmu_rmaps_stat_fops = { + .open = kvm_mmu_rmaps_stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_mmu_rmaps_stat_release, +}; + +int kvm_arch_create_vm_debugfs(struct kvm *kvm) +{ + debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm, + &mmu_rmaps_stat_fops); + return 0; +} diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d282ccf5f0c5..5daa8a910e8b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1035,6 +1035,26 @@ out: return true; } +unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) +{ + struct pte_list_desc *desc; + unsigned int count = 0; + + if (!rmap_head->val) + return 0; + else if (!(rmap_head->val & 1)) + return 1; + + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + + while (desc) { + count += desc->spte_count; + desc = desc->more; + } + + return count; +} + static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, const struct kvm_memory_slot *slot) { diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index ca7b7595bbfc..62bb8f758b3f 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -131,6 +131,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, int min_level); void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages); +unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); /* * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). -- cgit v1.2.3 From 5a324c24b638d0f3194e1dc8f0cebd28a0745238 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Aug 2021 23:52:36 +0300 Subject: Revert "KVM: x86/mmu: Allow zap gfn range to operate under the mmu read lock" This together with the next patch will fix a future race between kvm_zap_gfn_range and the page fault handler, which will happen when AVIC memslot is going to be only partially disabled. The performance impact is minimal since kvm_zap_gfn_range is only called by users, update_mtrr() and kvm_post_set_cr0(). Both only use it if the guest has non-coherent DMA, in order to honor the guest's UC memtype. MTRR and CD setup only happens at boot, and generally in an area where the page tables should be small (for CD) or should not include the affected GFNs at all (for MTRRs). This is based on a patch suggested by Sean Christopherson: https://lkml.org/lkml/2021/7/22/1025 Signed-off-by: Sean Christopherson Signed-off-by: Maxim Levitsky Message-Id: <20210810205251.424103-2-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 19 ++++++++----------- arch/x86/kvm/mmu/tdp_mmu.c | 15 ++++----------- arch/x86/kvm/mmu/tdp_mmu.h | 11 ++++------- 3 files changed, 16 insertions(+), 29 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5daa8a910e8b..9a7199679f62 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5682,8 +5682,9 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) int i; bool flush = false; + write_lock(&kvm->mmu_lock); + if (kvm_memslots_have_rmaps(kvm)) { - write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(memslot, slots) { @@ -5703,22 +5704,18 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) } if (flush) kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); - write_unlock(&kvm->mmu_lock); } if (is_tdp_mmu_enabled(kvm)) { - flush = false; - - read_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, - gfn_end, flush, true); - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end); - - read_unlock(&kvm->mmu_lock); + gfn_end, flush); } + + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + + write_unlock(&kvm->mmu_lock); } static bool slot_rmap_write_protect(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index a3146434c965..fb1b2dc7a6d1 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -818,21 +818,15 @@ retry: * non-root pages mapping GFNs strictly within that range. Returns true if * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. - * - * If shared is true, this thread holds the MMU lock in read mode and must - * account for the possibility that other threads are modifying the paging - * structures concurrently. If shared is false, this thread should hold the - * MMU in write mode. */ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, - gfn_t end, bool can_yield, bool flush, - bool shared) + gfn_t end, bool can_yield, bool flush) { struct kvm_mmu_page *root; - for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared) + for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false) flush = zap_gfn_range(kvm, root, start, end, can_yield, flush, - shared); + false); return flush; } @@ -843,8 +837,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) int i; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, - flush, false); + flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush); if (flush) kvm_flush_remote_tlbs(kvm); diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index b224d126adf9..358f447d4012 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -20,14 +20,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared); bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, - gfn_t end, bool can_yield, bool flush, - bool shared); + gfn_t end, bool can_yield, bool flush); static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, - gfn_t start, gfn_t end, bool flush, - bool shared) + gfn_t start, gfn_t end, bool flush) { - return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush, - shared); + return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush); } static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { @@ -44,7 +41,7 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) */ lockdep_assert_held_write(&kvm->mmu_lock); return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp), - sp->gfn, end, false, false, false); + sp->gfn, end, false, false); } void kvm_tdp_mmu_zap_all(struct kvm *kvm); -- cgit v1.2.3 From 2822da446640d82b7bf65800314ef2a825e8df13 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 10 Aug 2021 23:52:37 +0300 Subject: KVM: x86/mmu: fix parameters to kvm_flush_remote_tlbs_with_address kvm_flush_remote_tlbs_with_address expects (start gfn, number of pages), and not (start gfn, end gfn) Signed-off-by: Maxim Levitsky Message-Id: <20210810205251.424103-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 9a7199679f62..3cb2808e1e5c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5703,13 +5703,17 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) } } if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, + gfn_end - gfn_start); } if (is_tdp_mmu_enabled(kvm)) { for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, gfn_end, flush); + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, + gfn_end - gfn_start); } if (flush) -- cgit v1.2.3 From 88f585358b5e6aec8586425bdbaaa2157112ffc2 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 10 Aug 2021 23:52:38 +0300 Subject: KVM: x86/mmu: add comment explaining arguments to kvm_zap_gfn_range This comment makes it clear that the range of gfns that this function receives is non inclusive. Signed-off-by: Maxim Levitsky Message-Id: <20210810205251.424103-4-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3cb2808e1e5c..e53d09534113 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5675,6 +5675,10 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_mmu_uninit_tdp_mmu(kvm); } +/* + * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end + * (not including it) + */ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { struct kvm_memslots *slots; -- cgit v1.2.3 From edb298c663fccad65fe99fcec6a4f96cc344520d Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 10 Aug 2021 23:52:39 +0300 Subject: KVM: x86/mmu: bump mmu notifier count in kvm_zap_gfn_range This together with previous patch, ensures that kvm_zap_gfn_range doesn't race with page fault running on another vcpu, and will make this page fault code retry instead. This is based on a patch suggested by Sean Christopherson: https://lkml.org/lkml/2021/7/22/1025 Suggested-by: Sean Christopherson Signed-off-by: Maxim Levitsky Message-Id: <20210810205251.424103-5-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++++ include/linux/kvm_host.h | 5 +++++ virt/kvm/kvm_main.c | 7 +++++-- 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e53d09534113..916083eb4036 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5688,6 +5688,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) write_lock(&kvm->mmu_lock); + kvm_inc_notifier_count(kvm, gfn_start, gfn_end); + if (kvm_memslots_have_rmaps(kvm)) { for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); @@ -5723,6 +5725,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (flush) kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + kvm_dec_notifier_count(kvm, gfn_start, gfn_end); + write_unlock(&kvm->mmu_lock); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f50bfcf225f0..4e43843fe0d7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -991,6 +991,11 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc); void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); #endif +void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, + unsigned long end); +void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, + unsigned long end); + long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3e81b5d8b709..8563d9b725af 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -610,7 +610,7 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn); } -static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, +void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, unsigned long end) { /* @@ -638,6 +638,7 @@ static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, max(kvm->mmu_notifier_range_end, end); } } +EXPORT_SYMBOL_GPL(kvm_inc_notifier_count); static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *range) @@ -672,7 +673,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, return 0; } -static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, +void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, unsigned long end) { /* @@ -689,6 +690,8 @@ static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, */ kvm->mmu_notifier_count--; } +EXPORT_SYMBOL_GPL(kvm_dec_notifier_count); + static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, const struct mmu_notifier_range *range) -- cgit v1.2.3 From 33a5c0009d14e18ca2fbf610efd6cf2e7e34489a Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 10 Aug 2021 23:52:40 +0300 Subject: KVM: x86/mmu: rename try_async_pf to kvm_faultin_pfn try_async_pf is a wrong name for this function, since this code is used when asynchronous page fault is not enabled as well. This code is based on a patch from Sean Christopherson: https://lkml.org/lkml/2021/7/19/2970 Suggested-by: Sean Christopherson Signed-off-by: Maxim Levitsky Reviewed-by: Paolo Bonzini Message-Id: <20210810205251.424103-6-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 916083eb4036..e85ec37a8468 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3884,7 +3884,7 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, +static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, bool write, bool *writable) { @@ -3954,7 +3954,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva, + if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva, write, &map_writable)) return RET_PF_RETRY; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index ee044d357b5f..f349eae69bf3 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -881,7 +881,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva, + if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva, write_fault, &map_writable)) return RET_PF_RETRY; -- cgit v1.2.3 From 8f32d5e563cbf0507756aa929134b1a110b47a62 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 10 Aug 2021 23:52:41 +0300 Subject: KVM: x86/mmu: allow kvm_faultin_pfn to return page fault handling code This will allow it to return RET_PF_EMULATE for APIC mmio emulation. This code is based on a patch from Sean Christopherson: https://lkml.org/lkml/2021/7/19/2970 Suggested-by: Sean Christopherson Signed-off-by: Maxim Levitsky Reviewed-by: Paolo Bonzini Message-Id: <20210810205251.424103-7-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 17 ++++++++++------- arch/x86/kvm/mmu/paging_tmpl.h | 4 ++-- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e85ec37a8468..38e36cff82af 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3886,7 +3886,7 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, - bool write, bool *writable) + bool write, bool *writable, int *r) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); bool async; @@ -3897,7 +3897,7 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, * be zapped before KVM inserts a new MMIO SPTE for the gfn. */ if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) - return true; + goto out_retry; /* Don't expose private memslots to L2. */ if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) { @@ -3917,14 +3917,17 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (kvm_find_async_pf_gfn(vcpu, gfn)) { trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); - return true; + goto out_retry; } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) - return true; + goto out_retry; } *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable, hva); - return false; + +out_retry: + *r = RET_PF_RETRY; + return true; } static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, @@ -3955,8 +3958,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, smp_rmb(); if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva, - write, &map_writable)) - return RET_PF_RETRY; + write, &map_writable, &r)) + return r; if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r)) return r; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index f349eae69bf3..7d03e9b7ccfa 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -882,8 +882,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, smp_rmb(); if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva, - write_fault, &map_writable)) - return RET_PF_RETRY; + write_fault, &map_writable, &r)) + return r; if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) return r; -- cgit v1.2.3 From 9cc13d60ba6b9976e01ec6f66fa1ec4a06992929 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 10 Aug 2021 23:52:42 +0300 Subject: KVM: x86/mmu: allow APICv memslot to be enabled but invisible on AMD, APIC virtualization needs to dynamicaly inhibit the AVIC in a response to some events, and this is problematic and not efficient to do by enabling/disabling the memslot that covers APIC's mmio range. Plus due to SRCU locking, it makes it more complex to request AVIC inhibition. Instead, the APIC memslot will be always enabled, but be invisible to the guest, such as the MMU code will not install a SPTE for it, when it is inhibited and instead jump straight to emulating the access. When inhibiting the AVIC, this SPTE will be zapped. This code is based on a suggestion from Sean Christopherson: https://lkml.org/lkml/2021/7/19/2970 Suggested-by: Sean Christopherson Signed-off-by: Maxim Levitsky Message-Id: <20210810205251.424103-8-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 38e36cff82af..8568ae42e867 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3899,11 +3899,24 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) goto out_retry; - /* Don't expose private memslots to L2. */ - if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) { - *pfn = KVM_PFN_NOSLOT; - *writable = false; - return false; + if (!kvm_is_visible_memslot(slot)) { + /* Don't expose private memslots to L2. */ + if (is_guest_mode(vcpu)) { + *pfn = KVM_PFN_NOSLOT; + *writable = false; + return false; + } + /* + * If the APIC access page exists but is disabled, go directly + * to emulation without caching the MMIO access or creating a + * MMIO SPTE. That way the cache doesn't need to be purged + * when the AVIC is re-enabled. + */ + if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && + !kvm_apicv_activated(vcpu->kvm)) { + *r = RET_PF_EMULATE; + return true; + } } async = false; -- cgit v1.2.3 From 36222b117e36d487172c36bec187628085b92575 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 10 Aug 2021 23:52:43 +0300 Subject: KVM: x86: don't disable APICv memslot when inhibited Thanks to the former patches, it is now possible to keep the APICv memslot always enabled, and it will be invisible to the guest when it is inhibited This code is based on a suggestion from Sean Christopherson: https://lkml.org/lkml/2021/7/19/2970 Suggested-by: Sean Christopherson Signed-off-by: Maxim Levitsky Message-Id: <20210810205251.424103-9-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 1 - arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/svm/avic.c | 21 ++++++--------------- arch/x86/kvm/svm/svm.c | 1 - arch/x86/kvm/svm/svm.h | 1 - arch/x86/kvm/x86.c | 21 ++++++++------------- 6 files changed, 14 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index a12a4987154e..cefe1d81e2e8 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -72,7 +72,6 @@ KVM_X86_OP(enable_nmi_window) KVM_X86_OP(enable_irq_window) KVM_X86_OP(update_cr8_intercept) KVM_X86_OP(check_apicv_inhibit_reasons) -KVM_X86_OP_NULL(pre_update_apicv_exec_ctrl) KVM_X86_OP(refresh_apicv_exec_ctrl) KVM_X86_OP(hwapic_irr_update) KVM_X86_OP(hwapic_isr_update) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1881f4c92de9..15eefd9498b4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1352,7 +1352,6 @@ struct kvm_x86_ops { void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); bool (*check_apicv_inhibit_reasons)(ulong bit); - void (*pre_update_apicv_exec_ctrl)(struct kvm *kvm, bool activate); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index a8ad78a2faa1..d0acbeeab3d6 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -225,31 +225,26 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, * field of the VMCB. Therefore, we set up the * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here. */ -static int avic_update_access_page(struct kvm *kvm, bool activate) +static int avic_alloc_access_page(struct kvm *kvm) { void __user *ret; int r = 0; mutex_lock(&kvm->slots_lock); - /* - * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger - * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT - * memory region. So, we need to ensure that kvm->mm == current->mm. - */ - if ((kvm->arch.apic_access_memslot_enabled == activate) || - (kvm->mm != current->mm)) + + if (kvm->arch.apic_access_memslot_enabled) goto out; ret = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, APIC_DEFAULT_PHYS_BASE, - activate ? PAGE_SIZE : 0); + PAGE_SIZE); if (IS_ERR(ret)) { r = PTR_ERR(ret); goto out; } - kvm->arch.apic_access_memslot_enabled = activate; + kvm->arch.apic_access_memslot_enabled = true; out: mutex_unlock(&kvm->slots_lock); return r; @@ -270,7 +265,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) if (kvm_apicv_activated(vcpu->kvm)) { int ret; - ret = avic_update_access_page(vcpu->kvm, true); + ret = avic_alloc_access_page(vcpu->kvm); if (ret) return ret; } @@ -918,10 +913,6 @@ bool svm_check_apicv_inhibit_reasons(ulong bit) return supported & BIT(bit); } -void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate) -{ - avic_update_access_page(kvm, activate); -} static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2b6632d4c76f..b1662d6baa71 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4581,7 +4581,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_virtual_apic_mode = svm_set_virtual_apic_mode, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons, - .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index bd0fe94c2920..bd41f2a32838 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -534,7 +534,6 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu); void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu); void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); bool svm_check_apicv_inhibit_reasons(ulong bit); -void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate); void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4e97f7cd412e..4d720a0cdd80 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9255,13 +9255,6 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); -/* - * NOTE: Do not hold any lock prior to calling this. - * - * In particular, kvm_request_apicv_update() expects kvm->srcu not to be - * locked, because it calls __x86_set_memory_region() which does - * synchronize_srcu(&kvm->srcu). - */ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { unsigned long old, new, expected; @@ -9282,14 +9275,16 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new); } while (old != expected); - if (!!old == !!new) - return; + if (!!old != !!new) { + trace_kvm_apicv_update_request(activate, bit); + kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); + if (new) { + unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE); - trace_kvm_apicv_update_request(activate, bit); - if (kvm_x86_ops.pre_update_apicv_exec_ctrl) - static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate); + kvm_zap_gfn_range(kvm, gfn, gfn+1); + } + } - kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); } EXPORT_SYMBOL_GPL(kvm_request_apicv_update); -- cgit v1.2.3 From b0a1637f64b06586752cc507b94e4aeff02588d6 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 10 Aug 2021 23:52:44 +0300 Subject: KVM: x86: APICv: fix race in kvm_request_apicv_update on SVM Currently on SVM, the kvm_request_apicv_update toggles the APICv memslot without doing any synchronization. If there is a mismatch between that memslot state and the AVIC state, on one of the vCPUs, an APIC mmio access can be lost: For example: VCPU0: enable the APIC_ACCESS_PAGE_PRIVATE_MEMSLOT VCPU1: access an APIC mmio register. Since AVIC is still disabled on VCPU1, the access will not be intercepted by it, and neither will it cause MMIO fault, but rather it will just be read/written from/to the dummy page mapped into the APIC_ACCESS_PAGE_PRIVATE_MEMSLOT. Fix that by adding a lock guarding the AVIC state changes, and carefully order the operations of kvm_request_apicv_update to avoid this race: 1. Take the lock 2. Send KVM_REQ_APICV_UPDATE 3. Update the apic inhibit reason 4. Release the lock This ensures that at (2) all vCPUs are kicked out of the guest mode, but don't yet see the new avic state. Then only after (4) all other vCPUs can update their AVIC state and resume. Signed-off-by: Maxim Levitsky Message-Id: <20210810205251.424103-10-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/x86.c | 39 ++++++++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 15eefd9498b4..20a3ffe14ff2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1053,6 +1053,9 @@ struct kvm_arch { struct kvm_apic_map __rcu *apic_map; atomic_t apic_map_dirty; + /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */ + struct mutex apicv_update_lock; + bool apic_access_memslot_enabled; unsigned long apicv_inhibit_reasons; @@ -1736,6 +1739,9 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); void kvm_request_apicv_update(struct kvm *kvm, bool activate, unsigned long bit); +void __kvm_request_apicv_update(struct kvm *kvm, bool activate, + unsigned long bit); + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4d720a0cdd80..89e666e5a707 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8579,6 +8579,8 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated); static void kvm_apicv_init(struct kvm *kvm) { + mutex_init(&kvm->arch.apicv_update_lock); + if (enable_apicv) clear_bit(APICV_INHIBIT_REASON_DISABLE, &kvm->arch.apicv_inhibit_reasons); @@ -9240,6 +9242,8 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; + mutex_lock(&vcpu->kvm->arch.apicv_update_lock); + vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm); kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); @@ -9252,39 +9256,44 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) */ if (!vcpu->arch.apicv_active) kvm_make_request(KVM_REQ_EVENT, vcpu); + + mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); -void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { - unsigned long old, new, expected; + unsigned long old, new; if (!kvm_x86_ops.check_apicv_inhibit_reasons || !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) return; - old = READ_ONCE(kvm->arch.apicv_inhibit_reasons); - do { - expected = new = old; - if (activate) - __clear_bit(bit, &new); - else - __set_bit(bit, &new); - if (new == old) - break; - old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new); - } while (old != expected); + old = new = kvm->arch.apicv_inhibit_reasons; + + if (activate) + __clear_bit(bit, &new); + else + __set_bit(bit, &new); if (!!old != !!new) { trace_kvm_apicv_update_request(activate, bit); kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); + kvm->arch.apicv_inhibit_reasons = new; if (new) { unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE); - kvm_zap_gfn_range(kvm, gfn, gfn+1); } - } + } else + kvm->arch.apicv_inhibit_reasons = new; +} +EXPORT_SYMBOL_GPL(__kvm_request_apicv_update); +void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +{ + mutex_lock(&kvm->arch.apicv_update_lock); + __kvm_request_apicv_update(kvm, activate, bit); + mutex_unlock(&kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_request_apicv_update); -- cgit v1.2.3 From 4628efcd4e8963a41e877318cd10346dea9a6e00 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 10 Aug 2021 23:52:45 +0300 Subject: KVM: SVM: add warning for mistmatch between AVIC vcpu state and AVIC inhibition It is never a good idea to enter a guest on a vCPU when the AVIC inhibition state doesn't match the enablement of the AVIC on the vCPU. Signed-off-by: Maxim Levitsky Message-Id: <20210810205251.424103-11-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b1662d6baa71..22376fdd94f8 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3779,6 +3779,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) pre_svm_run(vcpu); + WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu)); + sync_lapic_to_cr8(vcpu); if (unlikely(svm->asid != svm->vmcb->control.asid)) { -- cgit v1.2.3 From 0f250a646382e017725001a552624be0c86527bf Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 10 Aug 2021 23:52:46 +0300 Subject: KVM: x86: hyper-v: Deactivate APICv only when AutoEOI feature is in use APICV_INHIBIT_REASON_HYPERV is currently unconditionally forced upon SynIC activation as SynIC's AutoEOI is incompatible with APICv/AVIC. It is, however, possible to track whether the feature was actually used by the guest and only inhibit APICv/AVIC when needed. TLFS suggests a dedicated 'HV_DEPRECATING_AEOI_RECOMMENDED' flag to let Windows know that AutoEOI feature should be avoided. While it's up to KVM userspace to set the flag, KVM can help a bit by exposing global APICv/AVIC enablement. Maxim: - always set HV_DEPRECATING_AEOI_RECOMMENDED in kvm_get_hv_cpuid, since this feature can be used regardless of AVIC Paolo: - use arch.apicv_update_lock to protect the hv->synic_auto_eoi_used instead of atomic ops Signed-off-by: Vitaly Kuznetsov Signed-off-by: Maxim Levitsky Message-Id: <20210810205251.424103-12-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/hyperv.c | 32 ++++++++++++++++++++++++++------ 2 files changed, 32 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 20a3ffe14ff2..28c85c80e831 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -981,6 +981,12 @@ struct kvm_hv { /* How many vCPUs have VP index != vCPU index */ atomic_t num_mismatched_vp_indexes; + /* + * How many SynICs use 'AutoEOI' feature + * (protected by arch.apicv_update_lock) + */ + unsigned int synic_auto_eoi_used; + struct hv_partition_assist_pg *hv_pa_pg; struct kvm_hv_syndbg hv_syndbg; }; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 0b38f944c6b6..fe4a02715266 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -88,6 +88,10 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic, static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, int vector) { + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + int auto_eoi_old, auto_eoi_new; + if (vector < HV_SYNIC_FIRST_VALID_VECTOR) return; @@ -96,10 +100,30 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, else __clear_bit(vector, synic->vec_bitmap); + auto_eoi_old = bitmap_weight(synic->auto_eoi_bitmap, 256); + if (synic_has_vector_auto_eoi(synic, vector)) __set_bit(vector, synic->auto_eoi_bitmap); else __clear_bit(vector, synic->auto_eoi_bitmap); + + auto_eoi_new = bitmap_weight(synic->auto_eoi_bitmap, 256); + + if (!!auto_eoi_old == !!auto_eoi_new) + return; + + mutex_lock(&vcpu->kvm->arch.apicv_update_lock); + + if (auto_eoi_new) + hv->synic_auto_eoi_used++; + else + hv->synic_auto_eoi_used--; + + __kvm_request_apicv_update(vcpu->kvm, + !hv->synic_auto_eoi_used, + APICV_INHIBIT_REASON_HYPERV); + + mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); } static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, @@ -933,12 +957,6 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) synic = to_hv_synic(vcpu); - /* - * Hyper-V SynIC auto EOI SINT's are - * not compatible with APICV, so request - * to deactivate APICV permanently. - */ - kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV); synic->active = true; synic->dont_zero_synic_pages = dont_zero_synic_pages; synic->control = HV_SYNIC_CONTROL_ENABLE; @@ -2476,6 +2494,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; if (!cpu_smt_possible()) ent->eax |= HV_X64_NO_NONARCH_CORESHARING; + + ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED; /* * Default number of spinlock retry attempts, matches * HyperV 2016. -- cgit v1.2.3 From 30eed56a7e1cbefe933a33d661827e5c72cd136f Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 10 Aug 2021 23:52:47 +0300 Subject: KVM: SVM: remove svm_toggle_avic_for_irq_window Now that kvm_request_apicv_update doesn't need to drop the kvm->srcu lock, we can call kvm_request_apicv_update directly. Signed-off-by: Maxim Levitsky Reviewed-by: Paolo Bonzini Message-Id: <20210810205251.424103-13-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 11 ----------- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/svm/svm.h | 1 - 3 files changed, 2 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index d0acbeeab3d6..1def54c26259 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -582,17 +582,6 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate) -{ - if (!enable_apicv || !lapic_in_kernel(vcpu)) - return; - - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - kvm_request_apicv_update(vcpu->kvm, activate, - APICV_INHIBIT_REASON_IRQWIN); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); -} - void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) { return; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 22376fdd94f8..4e76e78c98ec 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2992,7 +2992,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu) * In this case AVIC was temporarily disabled for * requesting the IRQ window and we have to re-enable it. */ - svm_toggle_avic_for_irq_window(vcpu, true); + kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN); ++vcpu->stat.irq_window_exits; return 1; @@ -3544,7 +3544,7 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu) * via AVIC. In such case, we need to temporarily disable AVIC, * and fallback to injecting IRQ via V_IRQ. */ - svm_toggle_avic_for_irq_window(vcpu, false); + kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN); svm_set_vintr(svm); } } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index bd41f2a32838..aae851762b59 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -524,7 +524,6 @@ int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); void avic_init_vmcb(struct vcpu_svm *svm); -void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate); int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu); int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu); int avic_init_vcpu(struct vcpu_svm *svm); -- cgit v1.2.3 From 06ef813466c63ff1a61b5f99592e58d049c2c1ac Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 10 Aug 2021 23:52:48 +0300 Subject: KVM: SVM: avoid refreshing avic if its state didn't change Since AVIC can be inhibited and uninhibited rapidly it is possible that we have nothing to do by the time the svm_refresh_apicv_exec_ctrl is called. Detect and avoid this, which will be useful when we will start calling avic_vcpu_load/avic_vcpu_put when the avic inhibition state changes. Signed-off-by: Maxim Levitsky Message-Id: <20210810205251.424103-14-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 89e666e5a707..bf8cb1021d11 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9239,12 +9239,18 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm) void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) { + bool activate; + if (!lapic_in_kernel(vcpu)) return; mutex_lock(&vcpu->kvm->arch.apicv_update_lock); - vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm); + activate = kvm_apicv_activated(vcpu->kvm); + if (vcpu->arch.apicv_active == activate) + goto out; + + vcpu->arch.apicv_active = activate; kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); @@ -9257,6 +9263,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) if (!vcpu->arch.apicv_active) kvm_make_request(KVM_REQ_EVENT, vcpu); +out: mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); -- cgit v1.2.3 From bf5f6b9d7ad6b88df15d691d9759f9a397488c7e Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 10 Aug 2021 23:52:49 +0300 Subject: KVM: SVM: move check for kvm_vcpu_apicv_active outside of avic_vcpu_{put|load} No functional change intended. Signed-off-by: Maxim Levitsky Message-Id: <20210810205251.424103-15-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 10 ++++------ arch/x86/kvm/svm/svm.c | 7 +++++-- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 1def54c26259..e7728b16a46f 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -940,9 +940,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); - if (!kvm_vcpu_apicv_active(vcpu)) - return; - /* * Since the host physical APIC id is 8 bits, * we can support host APIC ID upto 255. @@ -970,9 +967,6 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) u64 entry; struct vcpu_svm *svm = to_svm(vcpu); - if (!kvm_vcpu_apicv_active(vcpu)) - return; - entry = READ_ONCE(*(svm->avic_physical_id_cache)); if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) avic_update_iommu_vcpu_affinity(vcpu, -1, 0); @@ -989,6 +983,10 @@ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) struct vcpu_svm *svm = to_svm(vcpu); svm->avic_is_running = is_run; + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + if (is_run) avic_vcpu_load(vcpu, vcpu->cpu); else diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 4e76e78c98ec..114c7e29467b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1481,12 +1481,15 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) sd->current_vmcb = svm->vmcb; indirect_branch_prediction_barrier(); } - avic_vcpu_load(vcpu, cpu); + if (kvm_vcpu_apicv_active(vcpu)) + avic_vcpu_load(vcpu, cpu); } static void svm_vcpu_put(struct kvm_vcpu *vcpu) { - avic_vcpu_put(vcpu); + if (kvm_vcpu_apicv_active(vcpu)) + avic_vcpu_put(vcpu); + svm_prepare_host_switch(vcpu); ++vcpu->stat.host_state_reload; -- cgit v1.2.3 From df7e4827c5490a6a0cc41341497f5267712511cf Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 10 Aug 2021 23:52:50 +0300 Subject: KVM: SVM: call avic_vcpu_load/avic_vcpu_put when enabling/disabling AVIC Currently it is possible to have the following scenario: 1. AVIC is disabled by svm_refresh_apicv_exec_ctrl 2. svm_vcpu_blocking calls avic_vcpu_put which does nothing 3. svm_vcpu_unblocking enables the AVIC (due to KVM_REQ_APICV_UPDATE) and then calls avic_vcpu_load 4. warning is triggered in avic_vcpu_load since AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK was never cleared While it is possible to just remove the warning, it seems to be more robust to fully disable/enable AVIC in svm_refresh_apicv_exec_ctrl by calling the avic_vcpu_load/avic_vcpu_put Signed-off-by: Maxim Levitsky Message-Id: <20210810205251.424103-16-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index e7728b16a46f..01c0e83e1b71 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -651,6 +651,11 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) } vmcb_mark_dirty(vmcb, VMCB_AVIC); + if (activated) + avic_vcpu_load(vcpu, vcpu->cpu); + else + avic_vcpu_put(vcpu); + svm_set_pi_irte_mode(vcpu, activated); } -- cgit v1.2.3 From 73143035c214f3b0ac5cc2393197f828adeefc1e Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 10 Aug 2021 23:52:51 +0300 Subject: KVM: SVM: AVIC: drop unsupported AVIC base relocation code APIC base relocation is not supported anyway and won't work correctly so just drop the code that handles it and keep AVIC MMIO bar at the default APIC base. Signed-off-by: Maxim Levitsky Message-Id: <20210810205251.424103-17-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 2 ++ arch/x86/kvm/svm/svm.c | 7 ------- arch/x86/kvm/svm/svm.h | 6 ------ 3 files changed, 2 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 01c0e83e1b71..8052d92069e0 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -197,6 +197,8 @@ void avic_init_vmcb(struct vcpu_svm *svm) vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; + vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; + if (kvm_apicv_activated(svm->vcpu.kvm)) vmcb->control.int_ctl |= AVIC_ENABLE_MASK; else diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 114c7e29467b..7b58e445a967 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1314,9 +1314,6 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->virt_spec_ctrl = 0; init_vmcb(vcpu); - - if (kvm_vcpu_apicv_active(vcpu) && !init_event) - avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) @@ -2967,10 +2964,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->msr_decfg = data; break; } - case MSR_IA32_APICBASE: - if (kvm_vcpu_apicv_active(vcpu)) - avic_update_vapic_bar(to_svm(vcpu), data); - fallthrough; default: return kvm_set_msr_common(vcpu, msr); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index aae851762b59..524d943f3efc 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -503,12 +503,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops; #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL -static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) -{ - svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; - vmcb_mark_dirty(svm->vmcb, VMCB_AVIC); -} - static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); -- cgit v1.2.3 From f95937ccf5bd5e0a6bbac2b8e65a87982ffae403 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Mon, 2 Aug 2021 16:56:29 +0000 Subject: KVM: stats: Support linear and logarithmic histogram statistics Add new types of KVM stats, linear and logarithmic histogram. Histogram are very useful for observing the value distribution of time or size related stats. Signed-off-by: Jing Zhang Message-Id: <20210802165633.1866976-2-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/guest.c | 4 --- arch/mips/kvm/mips.c | 4 --- arch/powerpc/kvm/book3s.c | 4 --- arch/powerpc/kvm/booke.c | 4 --- arch/s390/kvm/kvm-s390.c | 4 --- arch/x86/kvm/x86.c | 4 --- include/linux/kvm_host.h | 90 +++++++++++++++++++++++++++++++++++++++-------- include/uapi/linux/kvm.h | 11 +++--- 8 files changed, 82 insertions(+), 43 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 1dfb83578277..5188184d25d0 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -31,8 +31,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS() }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -52,8 +50,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, mmio_exit_kernel), STATS_DESC_COUNTER(VCPU, exits) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index af9dd029a4e1..75c6f264c626 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -41,8 +41,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS() }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -85,8 +83,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, vz_cpucfg_exits), #endif }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 79833f78d1da..5cc6e90095b0 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -43,8 +43,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_ICOUNTER(VM, num_2M_pages), STATS_DESC_ICOUNTER(VM, num_1G_pages) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -88,8 +86,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, pthru_host), STATS_DESC_COUNTER(VCPU, pthru_bad_aff) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 551b30d84aee..5ed6c235e059 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -41,8 +41,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_ICOUNTER(VM, num_2M_pages), STATS_DESC_ICOUNTER(VM, num_1G_pages) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -79,8 +77,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, pthru_host), STATS_DESC_COUNTER(VCPU, pthru_bad_aff) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 02574d7b3612..4dc7e966a720 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -66,8 +66,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_COUNTER(VM, inject_service_signal), STATS_DESC_COUNTER(VM, inject_virtio) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -174,8 +172,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, instruction_diagnose_other), STATS_DESC_COUNTER(VCPU, pfault_sync) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bf8cb1021d11..9425589f34ca 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -238,8 +238,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size), STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -279,8 +277,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, directed_yield_successful), STATS_DESC_ICOUNTER(VCPU, guest_mode) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4e43843fe0d7..09fc0274b1eb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1356,56 +1356,66 @@ struct _kvm_stats_desc { char name[KVM_STATS_NAME_SIZE]; }; -#define STATS_DESC_COMMON(type, unit, base, exp) \ +#define STATS_DESC_COMMON(type, unit, base, exp, sz, bsz) \ .flags = type | unit | base | \ BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \ BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) | \ BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK), \ .exponent = exp, \ - .size = 1 + .size = sz, \ + .bucket_size = bsz -#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp) \ +#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, generic.stat) \ }, \ .name = #stat, \ } -#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp) \ +#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \ }, \ .name = #stat, \ } -#define VM_STATS_DESC(stat, type, unit, base, exp) \ +#define VM_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, stat) \ }, \ .name = #stat, \ } -#define VCPU_STATS_DESC(stat, type, unit, base, exp) \ +#define VCPU_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, stat) \ }, \ .name = #stat, \ } /* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */ -#define STATS_DESC(SCOPE, stat, type, unit, base, exp) \ - SCOPE##_STATS_DESC(stat, type, unit, base, exp) +#define STATS_DESC(SCOPE, stat, type, unit, base, exp, sz, bsz) \ + SCOPE##_STATS_DESC(stat, type, unit, base, exp, sz, bsz) #define STATS_DESC_CUMULATIVE(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, \ + unit, base, exponent, 1, 0) #define STATS_DESC_INSTANT(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, \ + unit, base, exponent, 1, 0) #define STATS_DESC_PEAK(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, \ + unit, base, exponent, 1, 0) +#define STATS_DESC_LINEAR_HIST(SCOPE, name, unit, base, exponent, sz, bsz) \ + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LINEAR_HIST, \ + unit, base, exponent, sz, bsz) +#define STATS_DESC_LOG_HIST(SCOPE, name, unit, base, exponent, sz) \ + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LOG_HIST, \ + unit, base, exponent, sz, 0) /* Cumulative counter, read/write */ #define STATS_DESC_COUNTER(SCOPE, name) \ @@ -1424,6 +1434,14 @@ struct _kvm_stats_desc { #define STATS_DESC_TIME_NSEC(SCOPE, name) \ STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ KVM_STATS_BASE_POW10, -9) +/* Linear histogram for time in nanosecond */ +#define STATS_DESC_LINHIST_TIME_NSEC(SCOPE, name, sz, bsz) \ + STATS_DESC_LINEAR_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ + KVM_STATS_BASE_POW10, -9, sz, bsz) +/* Logarithmic histogram for time in nanosecond */ +#define STATS_DESC_LOGHIST_TIME_NSEC(SCOPE, name, sz) \ + STATS_DESC_LOG_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ + KVM_STATS_BASE_POW10, -9, sz) #define KVM_GENERIC_VM_STATS() \ STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush) @@ -1437,10 +1455,52 @@ struct _kvm_stats_desc { STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns) extern struct dentry *kvm_debugfs_dir; + ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, const struct _kvm_stats_desc *desc, void *stats, size_t size_stats, char __user *user_buffer, size_t size, loff_t *offset); + +/** + * kvm_stats_linear_hist_update() - Update bucket value for linear histogram + * statistics data. + * + * @data: start address of the stats data + * @size: the number of bucket of the stats data + * @value: the new value used to update the linear histogram's bucket + * @bucket_size: the size (width) of a bucket + */ +static inline void kvm_stats_linear_hist_update(u64 *data, size_t size, + u64 value, size_t bucket_size) +{ + size_t index = div64_u64(value, bucket_size); + + index = min(index, size - 1); + ++data[index]; +} + +/** + * kvm_stats_log_hist_update() - Update bucket value for logarithmic histogram + * statistics data. + * + * @data: start address of the stats data + * @size: the number of bucket of the stats data + * @value: the new value used to update the logarithmic histogram's bucket + */ +static inline void kvm_stats_log_hist_update(u64 *data, size_t size, u64 value) +{ + size_t index = fls64(value); + + index = min(index, size - 1); + ++data[index]; +} + +#define KVM_STATS_LINEAR_HIST_UPDATE(array, value, bsize) \ + kvm_stats_linear_hist_update(array, ARRAY_SIZE(array), value, bsize) +#define KVM_STATS_LOG_HIST_UPDATE(array, value) \ + kvm_stats_log_hist_update(array, ARRAY_SIZE(array), value) + + extern const struct kvm_stats_header kvm_vm_stats_header; extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d9e4aabcb31a..a067410ebea5 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1965,7 +1965,9 @@ struct kvm_stats_header { #define KVM_STATS_TYPE_CUMULATIVE (0x0 << KVM_STATS_TYPE_SHIFT) #define KVM_STATS_TYPE_INSTANT (0x1 << KVM_STATS_TYPE_SHIFT) #define KVM_STATS_TYPE_PEAK (0x2 << KVM_STATS_TYPE_SHIFT) -#define KVM_STATS_TYPE_MAX KVM_STATS_TYPE_PEAK +#define KVM_STATS_TYPE_LINEAR_HIST (0x3 << KVM_STATS_TYPE_SHIFT) +#define KVM_STATS_TYPE_LOG_HIST (0x4 << KVM_STATS_TYPE_SHIFT) +#define KVM_STATS_TYPE_MAX KVM_STATS_TYPE_LOG_HIST #define KVM_STATS_UNIT_SHIFT 4 #define KVM_STATS_UNIT_MASK (0xF << KVM_STATS_UNIT_SHIFT) @@ -1988,8 +1990,9 @@ struct kvm_stats_header { * @size: The number of data items for this stats. * Every data item is of type __u64. * @offset: The offset of the stats to the start of stat structure in - * struture kvm or kvm_vcpu. - * @unused: Unused field for future usage. Always 0 for now. + * structure kvm or kvm_vcpu. + * @bucket_size: A parameter value used for histogram stats. It is only used + * for linear histogram stats, specifying the size of the bucket; * @name: The name string for the stats. Its size is indicated by the * &kvm_stats_header->name_size. */ @@ -1998,7 +2001,7 @@ struct kvm_stats_desc { __s16 exponent; __u16 size; __u32 offset; - __u32 unused; + __u32 bucket_size; char name[]; }; -- cgit v1.2.3 From 87bcc5fa092f82a9890f9e73e4f4c7016ef64049 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Mon, 2 Aug 2021 16:56:32 +0000 Subject: KVM: stats: Add halt_wait_ns stats for all architectures Add simple stats halt_wait_ns to record the time a VCPU has spent on waiting for all architectures (not just powerpc). Signed-off-by: Jing Zhang Message-Id: <20210802165633.1866976-5-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- arch/powerpc/include/asm/kvm_host.h | 1 - arch/powerpc/kvm/book3s.c | 1 - arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/kvm/booke.c | 1 - include/linux/kvm_host.h | 3 ++- include/linux/kvm_types.h | 1 + virt/kvm/kvm_main.c | 4 ++++ 7 files changed, 8 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 9f52f282b1aa..4931d03e5799 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -103,7 +103,6 @@ struct kvm_vcpu_stat { u64 emulated_inst_exits; u64 dec_exits; u64 ext_intr_exits; - u64 halt_wait_ns; u64 halt_successful_wait; u64 dbell_exits; u64 gdbell_exits; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 5cc6e90095b0..b785f6772391 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -69,7 +69,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, emulated_inst_exits), STATS_DESC_COUNTER(VCPU, dec_exits), STATS_DESC_COUNTER(VCPU, ext_intr_exits), - STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns), STATS_DESC_COUNTER(VCPU, halt_successful_wait), STATS_DESC_COUNTER(VCPU, dbell_exits), STATS_DESC_COUNTER(VCPU, gdbell_exits), diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1d1fcc290fca..813ca155561b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4144,7 +4144,7 @@ out: /* Attribute wait time */ if (do_sleep) { - vc->runner->stat.halt_wait_ns += + vc->runner->stat.generic.halt_wait_ns += ktime_to_ns(cur) - ktime_to_ns(start_wait); /* Attribute failed poll time */ if (vc->halt_poll_ns) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 5ed6c235e059..977801c83aff 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -67,7 +67,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, emulated_inst_exits), STATS_DESC_COUNTER(VCPU, dec_exits), STATS_DESC_COUNTER(VCPU, ext_intr_exits), - STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns), STATS_DESC_COUNTER(VCPU, halt_successful_wait), STATS_DESC_COUNTER(VCPU, dbell_exits), STATS_DESC_COUNTER(VCPU, gdbell_exits), diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 09fc0274b1eb..58a8ffee265e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1452,7 +1452,8 @@ struct _kvm_stats_desc { STATS_DESC_COUNTER(VCPU_GENERIC, halt_poll_invalid), \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_wakeup), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_success_ns), \ - STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns) + STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns), \ + STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_wait_ns) extern struct dentry *kvm_debugfs_dir; diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index ed6a985c5680..291ef55125b2 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -87,6 +87,7 @@ struct kvm_vcpu_stat_generic { u64 halt_wakeup; u64 halt_poll_success_ns; u64 halt_poll_fail_ns; + u64 halt_wait_ns; }; #define KVM_STATS_NAME_SIZE 48 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8563d9b725af..e6fc579bb454 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3241,6 +3241,10 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) } finish_rcuwait(&vcpu->wait); cur = ktime_get(); + if (waited) { + vcpu->stat.generic.halt_wait_ns += + ktime_to_ns(cur) - ktime_to_ns(poll_end); + } out: kvm_arch_vcpu_unblocking(vcpu); block_ns = ktime_to_ns(cur) - ktime_to_ns(start); -- cgit v1.2.3 From 8ccba534a1a5c6565220c81113d6157571f380cb Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Mon, 2 Aug 2021 16:56:33 +0000 Subject: KVM: stats: Add halt polling related histogram stats Add three log histogram stats to record the distribution of time spent on successful polling, failed polling and VCPU wait. halt_poll_success_hist: Distribution of spent time for a successful poll. halt_poll_fail_hist: Distribution of spent time for a failed poll. halt_wait_hist: Distribution of time a VCPU has spent on waiting. Signed-off-by: Jing Zhang Message-Id: <20210802165633.1866976-6-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_hv.c | 16 ++++++++++++++-- include/linux/kvm_host.h | 8 +++++++- include/linux/kvm_types.h | 5 +++++ virt/kvm/kvm_main.c | 12 ++++++++++++ 4 files changed, 38 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 813ca155561b..6d63c8e6d4f0 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4146,17 +4146,29 @@ out: if (do_sleep) { vc->runner->stat.generic.halt_wait_ns += ktime_to_ns(cur) - ktime_to_ns(start_wait); + KVM_STATS_LOG_HIST_UPDATE( + vc->runner->stat.generic.halt_wait_hist, + ktime_to_ns(cur) - ktime_to_ns(start_wait)); /* Attribute failed poll time */ - if (vc->halt_poll_ns) + if (vc->halt_poll_ns) { vc->runner->stat.generic.halt_poll_fail_ns += ktime_to_ns(start_wait) - ktime_to_ns(start_poll); + KVM_STATS_LOG_HIST_UPDATE( + vc->runner->stat.generic.halt_poll_fail_hist, + ktime_to_ns(start_wait) - + ktime_to_ns(start_poll)); + } } else { /* Attribute successful poll time */ - if (vc->halt_poll_ns) + if (vc->halt_poll_ns) { vc->runner->stat.generic.halt_poll_success_ns += ktime_to_ns(cur) - ktime_to_ns(start_poll); + KVM_STATS_LOG_HIST_UPDATE( + vc->runner->stat.generic.halt_poll_success_hist, + ktime_to_ns(cur) - ktime_to_ns(start_poll)); + } } /* Adjust poll time */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 58a8ffee265e..e4d712e9f760 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1453,7 +1453,13 @@ struct _kvm_stats_desc { STATS_DESC_COUNTER(VCPU_GENERIC, halt_wakeup), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_success_ns), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns), \ - STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_wait_ns) + STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_wait_ns), \ + STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_success_hist, \ + HALT_POLL_HIST_COUNT), \ + STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_hist, \ + HALT_POLL_HIST_COUNT), \ + STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_wait_hist, \ + HALT_POLL_HIST_COUNT) extern struct dentry *kvm_debugfs_dir; diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 291ef55125b2..de7fb5f364d8 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -76,6 +76,8 @@ struct kvm_mmu_memory_cache { }; #endif +#define HALT_POLL_HIST_COUNT 32 + struct kvm_vm_stat_generic { u64 remote_tlb_flush; }; @@ -88,6 +90,9 @@ struct kvm_vcpu_stat_generic { u64 halt_poll_success_ns; u64 halt_poll_fail_ns; u64 halt_wait_ns; + u64 halt_poll_success_hist[HALT_POLL_HIST_COUNT]; + u64 halt_poll_fail_hist[HALT_POLL_HIST_COUNT]; + u64 halt_wait_hist[HALT_POLL_HIST_COUNT]; }; #define KVM_STATS_NAME_SIZE 48 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e6fc579bb454..3e67c93ca403 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3222,13 +3222,23 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) ++vcpu->stat.generic.halt_successful_poll; if (!vcpu_valid_wakeup(vcpu)) ++vcpu->stat.generic.halt_poll_invalid; + + KVM_STATS_LOG_HIST_UPDATE( + vcpu->stat.generic.halt_poll_success_hist, + ktime_to_ns(ktime_get()) - + ktime_to_ns(start)); goto out; } cpu_relax(); poll_end = cur = ktime_get(); } while (kvm_vcpu_can_poll(cur, stop)); + + KVM_STATS_LOG_HIST_UPDATE( + vcpu->stat.generic.halt_poll_fail_hist, + ktime_to_ns(ktime_get()) - ktime_to_ns(start)); } + prepare_to_rcuwait(&vcpu->wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); @@ -3244,6 +3254,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) if (waited) { vcpu->stat.generic.halt_wait_ns += ktime_to_ns(cur) - ktime_to_ns(poll_end); + KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist, + ktime_to_ns(cur) - ktime_to_ns(poll_end)); } out: kvm_arch_vcpu_unblocking(vcpu); -- cgit v1.2.3 From 4293ddb788c1a98bdfa6479bcfd63ad5ce0a5ce6 Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Mon, 2 Aug 2021 21:46:05 -0700 Subject: KVM: x86/mmu: Remove redundant spte present check in mmu_set_spte Drop an unnecessary is_shadow_present_pte() check when updating the rmaps after installing a non-MMIO SPTE. set_spte() is used only to create shadow-present SPTEs, e.g. MMIO SPTEs are handled early on, mmu_set_spte() runs with mmu_lock held for write, i.e. the SPTE can't be zapped between writing the SPTE and updating the rmaps. Opportunistically combine the "new SPTE" logic for large pages and rmaps. No functional change intended. Suggested-by: Ben Gardon Reviewed-by: David Matlack Reviewed-by: Ben Gardon Reviewed-by: Sean Christopherson Signed-off-by: Mingwei Zhang Message-Id: <20210803044607.599629-2-mizhang@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8568ae42e867..2f6458bca65d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2776,17 +2776,13 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, pgprintk("%s: setting spte %llx\n", __func__, *sptep); trace_kvm_mmu_set_spte(level, gfn, sptep); - if (!was_rmapped && is_large_pte(*sptep)) - ++vcpu->kvm->stat.lpages; - if (is_shadow_present_pte(*sptep)) { - if (!was_rmapped) { - rmap_count = rmap_add(vcpu, sptep, gfn); - if (rmap_count > vcpu->kvm->stat.max_mmu_rmap_size) - vcpu->kvm->stat.max_mmu_rmap_size = rmap_count; - if (rmap_count > RMAP_RECYCLE_THRESHOLD) - rmap_recycle(vcpu, sptep, gfn); - } + if (!was_rmapped) { + if (is_large_pte(*sptep)) + ++vcpu->kvm->stat.lpages; + rmap_count = rmap_add(vcpu, sptep, gfn); + if (rmap_count > RMAP_RECYCLE_THRESHOLD) + rmap_recycle(vcpu, sptep, gfn); } return ret; -- cgit v1.2.3 From 088acd23526647844aec1c39db4ad02552c86c7b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Aug 2021 21:46:06 -0700 Subject: KVM: x86/mmu: Avoid collision with !PRESENT SPTEs in TDP MMU lpage stats Factor in whether or not the old/new SPTEs are shadow-present when adjusting the large page stats in the TDP MMU. A modified MMIO SPTE can toggle the page size bit, as bit 7 is used to store the MMIO generation, i.e. is_large_pte() can get a false positive when called on a MMIO SPTE. Ditto for nuking SPTEs with REMOVED_SPTE, which sets bit 7 in its magic value. Opportunistically move the logic below the check to verify at least one of the old/new SPTEs is shadow present. Use is/was_leaf even though is/was_present would suffice. The code generation is roughly equivalent since all flags need to be computed prior to the code in question, and using the *_leaf flags will minimize the diff in a future enhancement to account all pages, i.e. will change the check to "is_leaf != was_leaf". Reviewed-by: David Matlack Reviewed-by: Ben Gardon Fixes: 1699f65c8b65 ("kvm/x86: Fix 'lpages' kvm stat for TDM MMU") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Mingwei Zhang Message-Id: <20210803044607.599629-3-mizhang@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index fb1b2dc7a6d1..85f69558b490 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -412,6 +412,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, bool was_leaf = was_present && is_last_spte(old_spte, level); bool is_leaf = is_present && is_last_spte(new_spte, level); bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + bool was_large, is_large; WARN_ON(level > PT64_ROOT_MAX_LEVEL); WARN_ON(level < PG_LEVEL_4K); @@ -445,13 +446,6 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); - if (is_large_pte(old_spte) != is_large_pte(new_spte)) { - if (is_large_pte(old_spte)) - atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages); - else - atomic64_add(1, (atomic64_t*)&kvm->stat.lpages); - } - /* * The only times a SPTE should be changed from a non-present to * non-present state is when an MMIO entry is installed/modified/ @@ -477,6 +471,18 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, return; } + /* + * Update large page stats if a large page is being zapped, created, or + * is replacing an existing shadow page. + */ + was_large = was_leaf && is_large_pte(old_spte); + is_large = is_leaf && is_large_pte(new_spte); + if (was_large != is_large) { + if (was_large) + atomic64_sub(1, (atomic64_t *)&kvm->stat.lpages); + else + atomic64_add(1, (atomic64_t *)&kvm->stat.lpages); + } if (was_leaf && is_dirty_spte(old_spte) && (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) -- cgit v1.2.3 From 71f51d2c3253645ccff69d6fa3a870f47005f0b3 Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Mon, 2 Aug 2021 21:46:07 -0700 Subject: KVM: x86/mmu: Add detailed page size stats Existing KVM code tracks the number of large pages regardless of their sizes. Therefore, when large page of 1GB (or larger) is adopted, the information becomes less useful because lpages counts a mix of 1G and 2M pages. So remove the lpages since it is easy for user space to aggregate the info. Instead, provide a comprehensive page stats of all sizes from 4K to 512G. Suggested-by: Ben Gardon Reviewed-by: David Matlack Reviewed-by: Ben Gardon Signed-off-by: Mingwei Zhang Cc: Jing Zhang Cc: David Matlack Cc: Sean Christopherson Message-Id: <20210803044607.599629-4-mizhang@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 9 ++++++++- arch/x86/kvm/mmu.h | 4 ++++ arch/x86/kvm/mmu/mmu.c | 32 ++++++++++++++++---------------- arch/x86/kvm/mmu/tdp_mmu.c | 15 ++------------- arch/x86/kvm/x86.c | 4 +++- 5 files changed, 33 insertions(+), 31 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 28c85c80e831..4c4983a9378c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1214,7 +1214,14 @@ struct kvm_vm_stat { u64 mmu_recycled; u64 mmu_cache_miss; u64 mmu_unsync; - u64 lpages; + union { + struct { + atomic64_t pages_4k; + atomic64_t pages_2m; + atomic64_t pages_1g; + }; + atomic64_t pages[KVM_NR_PAGE_SIZES]; + }; u64 nx_lpage_splits; u64 max_mmu_page_hash_collisions; u64 max_mmu_rmap_size; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 59e831a8ab9d..e9688a9f7b57 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -261,4 +261,8 @@ kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, int level) return __kvm_mmu_slot_lpages(slot, slot->npages, level); } +static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count) +{ + atomic64_add(count, &kvm->stat.pages[level - 1]); +} #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2f6458bca65d..54cb15e4b550 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -604,10 +604,11 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * state bits, it is used to clear the last level sptep. * Returns the old PTE. */ -static u64 mmu_spte_clear_track_bits(u64 *sptep) +static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) { kvm_pfn_t pfn; u64 old_spte = *sptep; + int level = sptep_to_sp(sptep)->role.level; if (!spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, 0ull); @@ -617,6 +618,8 @@ static u64 mmu_spte_clear_track_bits(u64 *sptep) if (!is_shadow_present_pte(old_spte)) return old_spte; + kvm_update_page_stats(kvm, level, -1); + pfn = spte_to_pfn(old_spte); /* @@ -1001,14 +1004,15 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) } } -static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep) +static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + u64 *sptep) { - mmu_spte_clear_track_bits(sptep); + mmu_spte_clear_track_bits(kvm, sptep); __pte_list_remove(sptep, rmap_head); } /* Return true if rmap existed, false otherwise */ -static bool pte_list_destroy(struct kvm_rmap_head *rmap_head) +static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc, *next; int i; @@ -1017,7 +1021,7 @@ static bool pte_list_destroy(struct kvm_rmap_head *rmap_head) return false; if (!(rmap_head->val & 1)) { - mmu_spte_clear_track_bits((u64 *)rmap_head->val); + mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); goto out; } @@ -1025,7 +1029,7 @@ static bool pte_list_destroy(struct kvm_rmap_head *rmap_head) for (; desc; desc = next) { for (i = 0; i < desc->spte_count; i++) - mmu_spte_clear_track_bits(desc->sptes[i]); + mmu_spte_clear_track_bits(kvm, desc->sptes[i]); next = desc->more; mmu_free_pte_list_desc(desc); } @@ -1188,7 +1192,7 @@ out: static void drop_spte(struct kvm *kvm, u64 *sptep) { - u64 old_spte = mmu_spte_clear_track_bits(sptep); + u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep); if (is_shadow_present_pte(old_spte)) rmap_remove(kvm, sptep); @@ -1200,7 +1204,6 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) if (is_large_pte(*sptep)) { WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); - --kvm->stat.lpages; return true; } @@ -1450,7 +1453,7 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { - return pte_list_destroy(rmap_head); + return pte_list_destroy(kvm, rmap_head); } static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -1481,13 +1484,13 @@ restart: need_flush = 1; if (pte_write(pte)) { - pte_list_remove(rmap_head, sptep); + pte_list_remove(kvm, rmap_head, sptep); goto restart; } else { new_spte = kvm_mmu_changed_pte_notifier_make_spte( *sptep, new_pfn); - mmu_spte_clear_track_bits(sptep); + mmu_spte_clear_track_bits(kvm, sptep); mmu_spte_set(sptep, new_spte); } } @@ -2292,8 +2295,6 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_shadow_present_pte(pte)) { if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); - if (is_large_pte(pte)) - --kvm->stat.lpages; } else { child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); @@ -2778,8 +2779,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, trace_kvm_mmu_set_spte(level, gfn, sptep); if (!was_rmapped) { - if (is_large_pte(*sptep)) - ++vcpu->kvm->stat.lpages; + kvm_update_page_stats(vcpu->kvm, level, 1); rmap_count = rmap_add(vcpu, sptep, gfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); @@ -5809,7 +5809,7 @@ restart: if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, pfn, PG_LEVEL_NUM)) { - pte_list_remove(rmap_head, sptep); + pte_list_remove(kvm, rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 85f69558b490..db636250972a 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -412,7 +412,6 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, bool was_leaf = was_present && is_last_spte(old_spte, level); bool is_leaf = is_present && is_last_spte(new_spte, level); bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); - bool was_large, is_large; WARN_ON(level > PT64_ROOT_MAX_LEVEL); WARN_ON(level < PG_LEVEL_4K); @@ -471,18 +470,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, return; } - /* - * Update large page stats if a large page is being zapped, created, or - * is replacing an existing shadow page. - */ - was_large = was_leaf && is_large_pte(old_spte); - is_large = is_leaf && is_large_pte(new_spte); - if (was_large != is_large) { - if (was_large) - atomic64_sub(1, (atomic64_t *)&kvm->stat.lpages); - else - atomic64_add(1, (atomic64_t *)&kvm->stat.lpages); - } + if (is_leaf != was_leaf) + kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); if (was_leaf && is_dirty_spte(old_spte) && (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9425589f34ca..4e07cae56636 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -233,7 +233,9 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_COUNTER(VM, mmu_recycled), STATS_DESC_COUNTER(VM, mmu_cache_miss), STATS_DESC_ICOUNTER(VM, mmu_unsync), - STATS_DESC_ICOUNTER(VM, lpages), + STATS_DESC_ICOUNTER(VM, pages_4k), + STATS_DESC_ICOUNTER(VM, pages_2m), + STATS_DESC_ICOUNTER(VM, pages_1g), STATS_DESC_ICOUNTER(VM, nx_lpage_splits), STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size), STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions) -- cgit v1.2.3 From 9653f2da7522c5e762e2edd2beb53170669d0a2b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Aug 2021 15:45:54 -0700 Subject: KVM: x86/mmu: Drop 'shared' param from tdp_mmu_link_page() Drop @shared from tdp_mmu_link_page() and hardcode it to work for mmu_lock being held for read. The helper has exactly one caller and in all likelihood will only ever have exactly one caller. Even if KVM adds a path to install translations without an initiating page fault, odds are very, very good that the path will just be a wrapper to the "page fault" handler (both SNP and TDX RFCs propose patches to do exactly that). No functional change intended. Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210810224554.2978735-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index db636250972a..64ccfc1fa553 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -255,26 +255,17 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, * * @kvm: kvm instance * @sp: the new page - * @shared: This operation may not be running under the exclusive use of - * the MMU lock and the operation must synchronize with other - * threads that might be adding or removing pages. * @account_nx: This page replaces a NX large page and should be marked for * eventual reclaim. */ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, - bool shared, bool account_nx) + bool account_nx) { - if (shared) - spin_lock(&kvm->arch.tdp_mmu_pages_lock); - else - lockdep_assert_held_write(&kvm->mmu_lock); - + spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_add(&sp->link, &kvm->arch.tdp_mmu_pages); if (account_nx) account_huge_nx_page(kvm, sp); - - if (shared) - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } /** @@ -1062,7 +1053,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, !shadow_accessed_mask); if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) { - tdp_mmu_link_page(vcpu->kvm, sp, true, + tdp_mmu_link_page(vcpu->kvm, sp, huge_page_disallowed && req_level >= iter.level); -- cgit v1.2.3 From 7a4bca85b23f7a573da61f161dfbf8b00e9e2955 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Wed, 11 Aug 2021 15:29:22 +0300 Subject: KVM: SVM: split svm_handle_invalid_exit Split the check for having a vmexit handler to svm_check_exit_valid, and make svm_handle_invalid_exit only handle a vmexit that is already not valid. Signed-off-by: Maxim Levitsky Message-Id: <20210811122927.900604-2-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7b58e445a967..e45259177009 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3236,12 +3236,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "excp_to:", save->last_excp_to); } -static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) +static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code) { - if (exit_code < ARRAY_SIZE(svm_exit_handlers) && - svm_exit_handlers[exit_code]) - return 0; + return (exit_code < ARRAY_SIZE(svm_exit_handlers) && + svm_exit_handlers[exit_code]); +} +static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) +{ vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); dump_vmcb(vcpu); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -3249,14 +3251,13 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_code; vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; - - return -EINVAL; + return 0; } int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) { - if (svm_handle_invalid_exit(vcpu, exit_code)) - return 0; + if (!svm_check_exit_valid(vcpu, exit_code)) + return svm_handle_invalid_exit(vcpu, exit_code); #ifdef CONFIG_RETPOLINE if (exit_code == SVM_EXIT_MSR) -- cgit v1.2.3 From 61e5f69ef08379cdc74e8f15d3770976ed48480a Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Wed, 11 Aug 2021 15:29:26 +0300 Subject: KVM: x86: implement KVM_GUESTDBG_BLOCKIRQ KVM_GUESTDBG_BLOCKIRQ will allow KVM to block all interrupts while running. This change is mostly intended for more robust single stepping of the guest and it has the following benefits when enabled: * Resuming from a breakpoint is much more reliable. When resuming execution from a breakpoint, with interrupts enabled, more often than not, KVM would inject an interrupt and make the CPU jump immediately to the interrupt handler and eventually return to the breakpoint, to trigger it again. From the user point of view it looks like the CPU never executed a single instruction and in some cases that can even prevent forward progress, for example, when the breakpoint is placed by an automated script (e.g lx-symbols), which does something in response to the breakpoint and then continues the guest automatically. If the script execution takes enough time for another interrupt to arrive, the guest will be stuck on the same breakpoint RIP forever. * Normal single stepping is much more predictable, since it won't land the debugger into an interrupt handler. * RFLAGS.TF has less chance to be leaked to the guest: We set that flag behind the guest's back to do single stepping but if single step lands us into an interrupt/exception handler it will be leaked to the guest in the form of being pushed to the stack. This doesn't completely eliminate this problem as exceptions can still happen, but at least this reduces the chances of this happening. Signed-off-by: Maxim Levitsky Message-Id: <20210811122927.900604-6-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 1 + arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/include/uapi/asm/kvm.h | 1 + arch/x86/kvm/x86.c | 4 ++++ 4 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 86d7ad3a126c..4ea1bb28297b 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -3357,6 +3357,7 @@ flags which can include the following: - KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86] - KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86] - KVM_GUESTDBG_EXIT_PENDING: trigger an immediate guest exit [s390] + - KVM_GUESTDBG_BLOCKIRQ: avoid injecting interrupts/NMI/SMI [x86] For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints are enabled in memory so we need to ensure breakpoint exceptions are diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4c4983a9378c..7723865077fd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -222,7 +222,8 @@ enum x86_intercept_stage; KVM_GUESTDBG_USE_HW_BP | \ KVM_GUESTDBG_USE_SW_BP | \ KVM_GUESTDBG_INJECT_BP | \ - KVM_GUESTDBG_INJECT_DB) + KVM_GUESTDBG_INJECT_DB | \ + KVM_GUESTDBG_BLOCKIRQ) #define PFERR_PRESENT_BIT 0 diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index a6c327f8ad9e..2ef1f6513c68 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -295,6 +295,7 @@ struct kvm_debug_exit_arch { #define KVM_GUESTDBG_USE_HW_BP 0x00020000 #define KVM_GUESTDBG_INJECT_DB 0x00040000 #define KVM_GUESTDBG_INJECT_BP 0x00080000 +#define KVM_GUESTDBG_BLOCKIRQ 0x00100000 /* for KVM_SET_GUEST_DEBUG */ struct kvm_guest_debug_arch { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4e07cae56636..1a00af1b076b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8892,6 +8892,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) can_inject = false; } + /* Don't inject interrupts if the user asked to avoid doing so */ + if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) + return 0; + /* * Finally, inject interrupt events. If an event cannot be injected * due to architectural conditions (e.g. IF=0) a window-open exit -- cgit v1.2.3 From ec607a564f70519b340f7eb4cfc0f4a6b55285ac Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 6 Aug 2021 07:05:58 -0400 Subject: KVM: x86: clamp host mapping level to max_level in kvm_mmu_max_mapping_level This change started as a way to make kvm_mmu_hugepage_adjust a bit simpler, but it does fix two bugs as well. One bug is in zapping collapsible PTEs. If a large page size is disallowed but not all of them, kvm_mmu_max_mapping_level will return the host mapping level and the small PTEs will be zapped up to that level. However, if e.g. 1GB are prohibited, we can still zap 4KB mapping and preserve the 2MB ones. This can happen for example when NX huge pages are in use. The second would happen when userspace backs guest memory with a 1gb hugepage but only assign a subset of the page to the guest. 1gb pages would be disallowed by the memslot, but not 2mb. kvm_mmu_max_mapping_level() would fall through to the host_pfn_mapping_level() logic, see the 1gb hugepage, and map the whole thing into the guest. Fixes: 2f57b7051fe8 ("KVM: x86/mmu: Persist gfn_lpage_is_disallowed() to max_level") Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 54cb15e4b550..bfd2705a7291 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2910,6 +2910,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, int max_level) { struct kvm_lpage_info *linfo; + int host_level; max_level = min(max_level, max_huge_page_level); for ( ; max_level > PG_LEVEL_4K; max_level--) { @@ -2921,7 +2922,8 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; - return host_pfn_mapping_level(kvm, gfn, pfn, slot); + host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); + return min(host_level, max_level); } int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, @@ -2945,17 +2947,12 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return PG_LEVEL_4K; - level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); - if (level == PG_LEVEL_4K) - return level; - - *req_level = level = min(level, max_level); - /* * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. */ - if (huge_page_disallowed) + *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); + if (level == PG_LEVEL_4K || huge_page_disallowed) return PG_LEVEL_4K; /* -- cgit v1.2.3 From 746700d21fd52399c97aeb7791584bbf5426983c Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Wed, 18 Aug 2021 11:55:47 -0500 Subject: KVM: x86: Allow CPU to force vendor-specific TDP level AMD future CPUs will require a 5-level NPT if host CR4.LA57 is set. To prevent kvm_mmu_get_tdp_level() from incorrectly changing NPT level on behalf of CPUs, add a new parameter in kvm_configure_mmu() to force a fixed TDP level. Signed-off-by: Wei Huang Message-Id: <20210818165549.3771014-2-wei.huang2@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 ++--- arch/x86/kvm/mmu/mmu.c | 10 ++++++++-- arch/x86/kvm/svm/svm.c | 4 +++- arch/x86/kvm/vmx/vmx.c | 3 ++- 4 files changed, 15 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7723865077fd..24dac52b7bde 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -717,7 +717,6 @@ struct kvm_vcpu_arch { u64 reserved_gpa_bits; int maxphyaddr; - int max_tdp_level; /* emulate context */ @@ -1766,8 +1765,8 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd); -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, - int tdp_huge_page_level); +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, + int tdp_max_root_level, int tdp_huge_page_level); static inline u16 kvm_read_ldt(void) { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index bfd2705a7291..b2e68ce8b722 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -97,6 +97,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); bool tdp_enabled = false; static int max_huge_page_level __read_mostly; +static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; enum { @@ -4684,6 +4685,10 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { + /* tdp_root_level is architecture forced level, use it if nonzero */ + if (tdp_root_level) + return tdp_root_level; + /* Use 5-level TDP if and only if it's useful/necessary. */ if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) return 4; @@ -5375,10 +5380,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) */ } -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, - int tdp_huge_page_level) +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, + int tdp_max_root_level, int tdp_huge_page_level) { tdp_enabled = enable_tdp; + tdp_root_level = tdp_forced_root_level; max_tdp_level = tdp_max_root_level; /* diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e45259177009..bcffae2e36d2 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1013,7 +1013,9 @@ static __init int svm_hardware_setup(void) if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; - kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); + /* Force VM NPT level equal to the host's max NPT level */ + kvm_configure_mmu(npt_enabled, get_max_npt_level(), + get_max_npt_level(), PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); /* Note, SEV setup consumes npt_enabled. */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cd913100b300..fada1055f325 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7820,7 +7820,8 @@ static __init int hardware_setup(void) ept_lpage_level = PG_LEVEL_2M; else ept_lpage_level = PG_LEVEL_4K; - kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level); + kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), + ept_lpage_level); /* * Only enable PML when hardware supports PML feature, and both EPT -- cgit v1.2.3 From cb0f722aff6e9ba970a9fee9263c7821bbe811de Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Wed, 18 Aug 2021 11:55:48 -0500 Subject: KVM: x86/mmu: Support shadowing NPT when 5-level paging is enabled in host When the 5-level page table CPU flag is set in the host, but the guest has CR4.LA57=0 (including the case of a 32-bit guest), the top level of the shadow NPT page tables will be fixed, consisting of one pointer to a lower-level table and 511 non-present entries. Extend the existing code that creates the fixed PML4 or PDP table, to provide a fixed PML5 table if needed. This is not needed on EPT because the number of layers in the tables is specified in the EPTP instead of depending on the host CR4. Suggested-by: Paolo Bonzini Signed-off-by: Wei Huang Message-Id: <20210818165549.3771014-3-wei.huang2@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 53 ++++++++++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 24dac52b7bde..0532f4e84308 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -441,6 +441,7 @@ struct kvm_mmu { u64 *pae_root; u64 *pml4_root; + u64 *pml5_root; /* * check zero bits on shadow page table entries, these diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b2e68ce8b722..c04e30f6e0db 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3536,15 +3536,22 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * the shadow page table may be a PAE or a long mode page table. */ pm_mask = PT_PRESENT_MASK | shadow_me_mask; - if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) { + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; if (WARN_ON_ONCE(!mmu->pml4_root)) { r = -EIO; goto out_unlock; } - mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; + + if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) { + if (WARN_ON_ONCE(!mmu->pml5_root)) { + r = -EIO; + goto out_unlock; + } + mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask; + } } for (i = 0; i < 4; ++i) { @@ -3563,7 +3570,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) mmu->pae_root[i] = root | pm_mask; } - if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) + if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) + mmu->root_hpa = __pa(mmu->pml5_root); + else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) mmu->root_hpa = __pa(mmu->pml4_root); else mmu->root_hpa = __pa(mmu->pae_root); @@ -3579,7 +3588,9 @@ out_unlock: static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; - u64 *pml4_root, *pae_root; + u64 *pml5_root = NULL; + u64 *pml4_root = NULL; + u64 *pae_root; /* * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP @@ -3591,21 +3602,15 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) mmu->shadow_root_level < PT64_ROOT_4LEVEL) return 0; - /* - * This mess only works with 4-level paging and needs to be updated to - * work with 5-level paging. - */ - if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL)) - return -EIO; - - if (mmu->pae_root && mmu->pml4_root) + if (mmu->pae_root && mmu->pml4_root && mmu->pml5_root) return 0; /* * The special roots should always be allocated in concert. Yell and * bail if KVM ends up in a state where only one of the roots is valid. */ - if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root)) + if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root || + mmu->pml5_root)) return -EIO; /* @@ -3616,16 +3621,31 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) if (!pae_root) return -ENOMEM; +#ifdef CONFIG_X86_64 pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!pml4_root) { - free_page((unsigned long)pae_root); - return -ENOMEM; + if (!pml4_root) + goto err_pml4; + + if (mmu->shadow_root_level > PT64_ROOT_4LEVEL) { + pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!pml5_root) + goto err_pml5; } +#endif mmu->pae_root = pae_root; mmu->pml4_root = pml4_root; + mmu->pml5_root = pml5_root; return 0; + +#ifdef CONFIG_X86_64 +err_pml5: + free_page((unsigned long)pml4_root); +err_pml4: + free_page((unsigned long)pae_root); + return -ENOMEM; +#endif } void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) @@ -5461,6 +5481,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu) set_memory_encrypted((unsigned long)mmu->pae_root, 1); free_page((unsigned long)mmu->pae_root); free_page((unsigned long)mmu->pml4_root); + free_page((unsigned long)mmu->pml5_root); } static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) -- cgit v1.2.3 From 43e540cc9f2ca12a2364ddf64e5ef929a546550d Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Wed, 18 Aug 2021 11:55:49 -0500 Subject: KVM: SVM: Add 5-level page table support for SVM When the 5-level page table is enabled on host OS, the nested page table for guest VMs must use 5-level as well. Update get_npt_level() function to reflect this requirement. In the meanwhile, remove the code that prevents kvm-amd driver from being loaded when 5-level page table is detected. Signed-off-by: Wei Huang Message-Id: <20210818165549.3771014-4-wei.huang2@amd.com> [Tweak condition as suggested by Sean. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bcffae2e36d2..1a70e11f0487 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -259,7 +259,7 @@ u32 svm_msrpm_offset(u32 msr) static int get_max_npt_level(void) { #ifdef CONFIG_X86_64 - return PT64_ROOT_4LEVEL; + return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; #else return PT32E_ROOT_LEVEL; #endif @@ -460,11 +460,6 @@ static int has_svm(void) return 0; } - if (pgtable_l5_enabled()) { - pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n"); - return 0; - } - return 1; } -- cgit v1.2.3 From 8ce8a6fce9bfd3fcabe230ad104e2caf08b2e58d Mon Sep 17 00:00:00 2001 From: Raghavendra Rao Ananta Date: Mon, 23 Aug 2021 22:39:40 +0000 Subject: KVM: arm64: Trim guest debug exception handling The switch-case for handling guest debug exception covers all the debug exception classes, but functionally, doesn't do anything with them other than ESR_ELx_EC_WATCHPT_LOW. Moreover, even though handled well, the 'default' case could be confusing from a security point of view, stating that the guests' actions can potentially flood the syslog. But in reality, the code is unreachable. Hence, trim down the function to only handle the case with ESR_ELx_EC_WATCHPT_LOW with a simple 'if' check. Suggested-by: Marc Zyngier Signed-off-by: Raghavendra Rao Ananta Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210823223940.1878930-1-rananta@google.com --- arch/arm64/kvm/handle_exit.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 6f48336b1d86..b143f1ab6520 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -113,34 +113,20 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu) * guest and host are using the same debug facilities it will be up to * userspace to re-inject the correct exception for guest delivery. * - * @return: 0 (while setting vcpu->run->exit_reason), -1 for error + * @return: 0 (while setting vcpu->run->exit_reason) */ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; u32 esr = kvm_vcpu_get_esr(vcpu); - int ret = 0; run->exit_reason = KVM_EXIT_DEBUG; run->debug.arch.hsr = esr; - switch (ESR_ELx_EC(esr)) { - case ESR_ELx_EC_WATCHPT_LOW: + if (ESR_ELx_EC(esr) == ESR_ELx_EC_WATCHPT_LOW) run->debug.arch.far = vcpu->arch.fault.far_el2; - fallthrough; - case ESR_ELx_EC_SOFTSTP_LOW: - case ESR_ELx_EC_BREAKPT_LOW: - case ESR_ELx_EC_BKPT32: - case ESR_ELx_EC_BRK64: - break; - default: - kvm_err("%s: un-handled case esr: %#08x\n", - __func__, (unsigned int) esr); - ret = -1; - break; - } - return ret; + return 0; } static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 7119decf47d9867266459615be502e5d2cecedba Mon Sep 17 00:00:00 2001 From: Janis Schoetterl-Glausch Date: Tue, 29 Jun 2021 10:55:30 +0200 Subject: KVM: s390: Enable specification exception interpretation When this feature is enabled the hardware is free to interpret specification exceptions generated by the guest, instead of causing program interruption interceptions. This benefits (test) programs that generate a lot of specification exceptions (roughly 4x increase in exceptions/sec). Interceptions will occur as before if ICTL_PINT is set, i.e. if guest debug is enabled. There is no indication if this feature is available or not and the hardware is free to interpret or not. So we can simply set this bit and if the hardware ignores it we fall back to intercept 8 handling. Signed-off-by: Janis Schoetterl-Glausch Link: https://lore.kernel.org/linux-s390/20210706114714.3936825-1-scgl@linux.ibm.com/ Reviewed-by: David Hildenbrand Reviewed-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 1 + arch/s390/kvm/kvm-s390.c | 2 ++ arch/s390/kvm/vsie.c | 2 ++ 3 files changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 8925f3969478..118d5450c523 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -244,6 +244,7 @@ struct kvm_s390_sie_block { __u8 fpf; /* 0x0060 */ #define ECB_GS 0x40 #define ECB_TE 0x10 +#define ECB_SPECI 0x08 #define ECB_SRSI 0x04 #define ECB_HOSTPROTINT 0x02 __u8 ecb; /* 0x0061 */ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f72f361d39dd..5b45c83ced21 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3180,6 +3180,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->ecb |= ECB_SRSI; if (test_kvm_facility(vcpu->kvm, 73)) vcpu->arch.sie_block->ecb |= ECB_TE; + if (!kvm_is_ucontrol(vcpu->kvm)) + vcpu->arch.sie_block->ecb |= ECB_SPECI; if (test_kvm_facility(vcpu->kvm, 8) && vcpu->kvm->arch.use_pfmfi) vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI; diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 4002a24bc43a..acda4b6fc851 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -510,6 +510,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) prefix_unmapped(vsie_page); scb_s->ecb |= ECB_TE; } + /* specification exception interpretation */ + scb_s->ecb |= scb_o->ecb & ECB_SPECI; /* branch prediction */ if (test_kvm_facility(vcpu->kvm, 82)) scb_s->fpf |= scb_o->fpf & FPF_BPBC; -- cgit v1.2.3 From a3e03bc1368c1bc16e19b001fc96dc7430573cc8 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Fri, 27 Aug 2021 14:54:29 +0200 Subject: KVM: s390: index kvm->arch.idle_mask by vcpu_idx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While in practice vcpu->vcpu_idx == vcpu->vcp_id is often true, it may not always be, and we must not rely on this. Reason is that KVM decides the vcpu_idx, userspace decides the vcpu_id, thus the two might not match. Currently kvm->arch.idle_mask is indexed by vcpu_id, which implies that code like for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) { vcpu = kvm_get_vcpu(kvm, vcpu_id); do_stuff(vcpu); } is not legit. Reason is that kvm_get_vcpu expects an vcpu_idx, not an vcpu_id. The trouble is, we do actually use kvm->arch.idle_mask like this. To fix this problem we have two options. Either use kvm_get_vcpu_by_id(vcpu_id), which would loop to find the right vcpu_id, or switch to indexing via vcpu_idx. The latter is preferable for obvious reasons. Let us make switch from indexing kvm->arch.idle_mask by vcpu_id to indexing it by vcpu_idx. To keep gisa_int.kicked_mask indexed by the same index as idle_mask lets make the same change for it as well. Fixes: 1ee0bc559dc3 ("KVM: s390: get rid of local_int array") Signed-off-by: Halil Pasic Reviewed-by: Christian Bornträger Reviewed-by: Claudio Imbrenda Cc: # 3.15+ Link: https://lore.kernel.org/r/20210827125429.1912577-1-pasic@linux.ibm.com Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 1 + arch/s390/kvm/interrupt.c | 12 ++++++------ arch/s390/kvm/kvm-s390.c | 2 +- arch/s390/kvm/kvm-s390.h | 2 +- 4 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 118d5450c523..611f18ecde91 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -963,6 +963,7 @@ struct kvm_arch{ atomic64_t cmma_dirty_pages; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); + /* indexed by vcpu_idx */ DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct kvm_s390_gisa_interrupt gisa_int; struct kvm_s390_pv pv; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index d548d60caed2..16256e17a544 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -419,13 +419,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) static void __set_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); - set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); } static void __unset_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); } static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) @@ -3050,18 +3050,18 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask) { - int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus); + int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus); struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_vcpu *vcpu; - for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) { - vcpu = kvm_get_vcpu(kvm, vcpu_id); + for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) { + vcpu = kvm_get_vcpu(kvm, vcpu_idx); if (psw_ioint_disabled(vcpu)) continue; deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24); if (deliverable_mask) { /* lately kicked but not yet running */ - if (test_and_set_bit(vcpu_id, gi->kicked_mask)) + if (test_and_set_bit(vcpu_idx, gi->kicked_mask)) return; kvm_s390_vcpu_wakeup(vcpu); return; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 5b45c83ced21..e144c8046ceb 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4026,7 +4026,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) kvm_s390_patch_guest_per_regs(vcpu); } - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask); + clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask); vcpu->arch.sie_block->icptcode = 0; cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 9fad25109b0d..ecd741ee3276 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -79,7 +79,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) -- cgit v1.2.3 From a717a780fc4e1dddd3fbf9ad08f180eec2a78194 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Aug 2021 17:58:24 -0700 Subject: KVM: x86/mmu: Don't freak out if pml5_root is NULL on 4-level host Include pml5_root in the set of special roots if and only if the host, and thus NPT, is using 5-level paging. mmu_alloc_special_roots() expects special roots to be allocated as a bundle, i.e. they're either all valid or all NULL. But for pml5_root, that expectation only holds true if the host uses 5-level paging, which causes KVM to WARN about pml5_root being NULL when the other special roots are valid. The silver lining of 4-level vs. 5-level NPT being tied to the host kernel's paging level is that KVM's shadow root level is constant; unlike VMX's EPT, KVM can't choose 4-level NPT based on guest.MAXPHYADDR. That means KVM can still expect pml5_root to be bundled with the other special roots, it just needs to be conditioned on the shadow root level. Fixes: cb0f722aff6e ("KVM: x86/mmu: Support shadowing NPT when 5-level paging is enabled in host") Reported-by: Maxim Levitsky Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20210824005824.205536-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c04e30f6e0db..e7bac871747c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3588,6 +3588,7 @@ out_unlock: static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; + bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL; u64 *pml5_root = NULL; u64 *pml4_root = NULL; u64 *pae_root; @@ -3602,7 +3603,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) mmu->shadow_root_level < PT64_ROOT_4LEVEL) return 0; - if (mmu->pae_root && mmu->pml4_root && mmu->pml5_root) + /* + * NPT, the only paging mode that uses this horror, uses a fixed number + * of levels for the shadow page tables, e.g. all MMUs are 4-level or + * all MMus are 5-level. Thus, this can safely require that pml5_root + * is allocated if the other roots are valid and pml5 is needed, as any + * prior MMU would also have required pml5. + */ + if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root)) return 0; /* @@ -3610,7 +3618,7 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) * bail if KVM ends up in a state where only one of the roots is valid. */ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root || - mmu->pml5_root)) + (need_pml5 && mmu->pml5_root))) return -EIO; /* @@ -3626,7 +3634,7 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) if (!pml4_root) goto err_pml4; - if (mmu->shadow_root_level > PT64_ROOT_4LEVEL) { + if (need_pml5) { pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pml5_root) goto err_pml5; -- cgit v1.2.3 From 81b4b56d4f8130bbb99cf4e2b48082e5b4cfccb9 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 26 Aug 2021 12:57:49 +0300 Subject: KVM: VMX: avoid running vmx_handle_exit_irqoff in case of emulation If we are emulating an invalid guest state, we don't have a correct exit reason, and thus we shouldn't do anything in this function. Signed-off-by: Maxim Levitsky Message-Id: <20210826095750.1650467-2-mlevitsk@redhat.com> Cc: stable@vger.kernel.org Fixes: 95b5a48c4f2b ("KVM: VMX: Handle NMIs, #MCs and async #PFs in common irqs-disabled fn", 2019-06-18) Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index fada1055f325..0c2c0d5ae873 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6382,6 +6382,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (vmx->emulation_required) + return; + if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu); else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) -- cgit v1.2.3 From 4ddacd525a2f16cebec8ba409fbce6b6fb45e987 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Fri, 3 Sep 2021 17:15:58 -0400 Subject: kvm: x86: Set KVM_MAX_VCPU_ID to 4*KVM_MAX_VCPUS Instead of requiring KVM_MAX_VCPU_ID to be manually increased every time we increase KVM_MAX_VCPUS, set it to 4*KVM_MAX_VCPUS. This should be enough for CPU topologies where Cores-per-Package and Packages-per-Socket are not powers of 2. In practice, this increases KVM_MAX_VCPU_ID from 1023 to 1152. The only side effect of this change is making some fields in struct kvm_ioapic larger, increasing the struct size from 1628 to 1780 bytes (in x86_64). Signed-off-by: Eduardo Habkost Message-Id: <20210903211600.2002377-2-ehabkost@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0532f4e84308..59da1fbf5248 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -39,7 +39,19 @@ #define KVM_MAX_VCPUS 288 #define KVM_SOFT_MAX_VCPUS 240 -#define KVM_MAX_VCPU_ID 1023 + +/* + * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs + * might be larger than the actual number of VCPUs because the + * APIC ID encodes CPU topology information. + * + * In the worst case, we'll need less than one extra bit for the + * Core ID, and less than one extra bit for the Package (Die) ID, + * so ratio of 4 should be enough. + */ +#define KVM_VCPU_ID_RATIO 4 +#define KVM_MAX_VCPU_ID (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) + /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 -- cgit v1.2.3 From 074c82c8f7cf8a46c3b81965f122599e3a133450 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Fri, 3 Sep 2021 17:15:59 -0400 Subject: kvm: x86: Increase MAX_VCPUS to 1024 Increase KVM_MAX_VCPUS to 1024, so we can test larger VMs. I'm not changing KVM_SOFT_MAX_VCPUS yet because I'm afraid it might involve complicated questions around the meaning of "supported" and "recommended" in the upstream tree. KVM_SOFT_MAX_VCPUS will be changed in a separate patch. For reference, visible effects of this change are: - KVM_CAP_MAX_VCPUS will now return 1024 (of course) - Default value for CPUID[HYPERV_CPUID_IMPLEMENT_LIMITS (00x40000005)].EAX will now be 1024 - KVM_MAX_VCPU_ID will change from 1151 to 4096 - Size of struct kvm will increase from 19328 to 22272 bytes (in x86_64) - Size of struct kvm_ioapic will increase from 1780 to 5084 bytes (in x86_64) - Bitmap stack variables that will grow: - At kvm_hv_flush_tlb() kvm_hv_send_ipi(), vp_bitmap[] and vcpu_bitmap[] will now be 128 bytes long - vcpu_bitmap at bioapic_write_indirect() will be 128 bytes long once patch "KVM: x86: Fix stack-out-of-bounds memory access from ioapic_write_indirect()" is applied Signed-off-by: Eduardo Habkost Message-Id: <20210903211600.2002377-3-ehabkost@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 59da1fbf5248..52ca8da4ab52 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -37,7 +37,7 @@ #define __KVM_HAVE_ARCH_VCPU_DEBUGFS -#define KVM_MAX_VCPUS 288 +#define KVM_MAX_VCPUS 1024 #define KVM_SOFT_MAX_VCPUS 240 /* -- cgit v1.2.3 From 1dbaf04cb91b2b680d10f9a8f9f823d94c7087bf Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Fri, 3 Sep 2021 17:16:00 -0400 Subject: kvm: x86: Increase KVM_SOFT_MAX_VCPUS to 710 Support for 710 VCPUs was tested by Red Hat since RHEL-8.4, so increase KVM_SOFT_MAX_VCPUS to 710. Signed-off-by: Eduardo Habkost Message-Id: <20210903211600.2002377-4-ehabkost@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 52ca8da4ab52..f8f48a7ec577 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -38,7 +38,7 @@ #define __KVM_HAVE_ARCH_VCPU_DEBUGFS #define KVM_MAX_VCPUS 1024 -#define KVM_SOFT_MAX_VCPUS 240 +#define KVM_SOFT_MAX_VCPUS 710 /* * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs -- cgit v1.2.3 From 678a305b85d95f288c12e3d69a32d3351b34f2bb Mon Sep 17 00:00:00 2001 From: Jia He Date: Mon, 30 Aug 2021 22:53:36 +0800 Subject: KVM: x86/mmu: Remove unused field mmio_cached in struct kvm_mmu_page After reverting and restoring the fast tlb invalidation patch series, the mmio_cached is not removed. Hence a unused field is left in kvm_mmu_page. Cc: Sean Christopherson Signed-off-by: Jia He Message-Id: <20210830145336.27183-1-justin.he@arm.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu_internal.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 62bb8f758b3f..6b6f10895710 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -37,7 +37,6 @@ struct kvm_mmu_page { bool unsync; u8 mmu_valid_gen; - bool mmio_cached; bool lpage_disallowed; /* Can't be replaced by an equiv large page */ /* -- cgit v1.2.3 From e7177339d7b5f9594b316842122b5fda9513d5e2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 31 Aug 2021 09:42:22 -0700 Subject: Revert "KVM: x86: mmu: Add guest physical address check in translate_gpa()" Revert a misguided illegal GPA check when "translating" a non-nested GPA. The check is woefully incomplete as it does not fill in @exception as expected by all callers, which leads to KVM attempting to inject a bogus exception, potentially exposing kernel stack information in the process. WARNING: CPU: 0 PID: 8469 at arch/x86/kvm/x86.c:525 exception_type+0x98/0xb0 arch/x86/kvm/x86.c:525 CPU: 1 PID: 8469 Comm: syz-executor531 Not tainted 5.14.0-rc7-syzkaller #0 RIP: 0010:exception_type+0x98/0xb0 arch/x86/kvm/x86.c:525 Call Trace: x86_emulate_instruction+0xef6/0x1460 arch/x86/kvm/x86.c:7853 kvm_mmu_page_fault+0x2f0/0x1810 arch/x86/kvm/mmu/mmu.c:5199 handle_ept_misconfig+0xdf/0x3e0 arch/x86/kvm/vmx/vmx.c:5336 __vmx_handle_exit arch/x86/kvm/vmx/vmx.c:6021 [inline] vmx_handle_exit+0x336/0x1800 arch/x86/kvm/vmx/vmx.c:6038 vcpu_enter_guest+0x2a1c/0x4430 arch/x86/kvm/x86.c:9712 vcpu_run arch/x86/kvm/x86.c:9779 [inline] kvm_arch_vcpu_ioctl_run+0x47d/0x1b20 arch/x86/kvm/x86.c:10010 kvm_vcpu_ioctl+0x49e/0xe50 arch/x86/kvm/../../../virt/kvm/kvm_main.c:3652 The bug has escaped notice because practically speaking the GPA check is useless. The GPA check in question only comes into play when KVM is walking guest page tables (or "translating" CR3), and KVM already handles illegal GPA checks by setting reserved bits in rsvd_bits_mask for each PxE, or in the case of CR3 for loading PTDPTRs, manually checks for an illegal CR3. This particular failure doesn't hit the existing reserved bits checks because syzbot sets guest.MAXPHYADDR=1, and IA32 architecture simply doesn't allow for such an absurd MAXPHYADDR, e.g. 32-bit paging doesn't define any reserved PA bits checks, which KVM emulates by only incorporating the reserved PA bits into the "high" bits, i.e. bits 63:32. Simply remove the bogus check. There is zero meaningful value and no architectural justification for supporting guest.MAXPHYADDR < 32, and properly filling the exception would introduce non-trivial complexity. This reverts commit ec7771ab471ba6a945350353617e2e3385d0e013. Fixes: ec7771ab471b ("KVM: x86: mmu: Add guest physical address check in translate_gpa()") Cc: stable@vger.kernel.org Reported-by: syzbot+200c08e88ae818f849ce@syzkaller.appspotmail.com Signed-off-by: Sean Christopherson Message-Id: <20210831164224.1119728-2-seanjc@google.com> Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e7bac871747c..9b0cdec8b62d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -334,12 +334,6 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) { - /* Check if guest physical address doesn't exceed guest maximum */ - if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) { - exception->error_code |= PFERR_RSVD_MASK; - return UNMAPPED_GVA; - } - return gpa; } -- cgit v1.2.3 From ca41c34cab1f50f13ab5ac95739483871637a684 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Sep 2021 15:10:22 -0700 Subject: KVM: x86/mmu: Relocate kvm_mmu_page.tdp_mmu_page for better cache locality Move "tdp_mmu_page" into the 1-byte void left by the recently removed "mmio_cached" so that it resides in the first 64 bytes of kvm_mmu_page, i.e. in the same cache line as the most commonly accessed fields. Don't bother wrapping tdp_mmu_page in CONFIG_X86_64, including the field in 32-bit builds doesn't affect the size of kvm_mmu_page, and a future patch can always wrap the field in the unlikely event KVM gains a 1-byte flag that is 32-bit specific. Note, the size of kvm_mmu_page is also unchanged on CONFIG_X86_64=y due to it previously sharing an 8-byte chunk with write_flooding_count. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210901221023.1303578-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu_internal.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 6b6f10895710..4e7b634efa38 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -35,6 +35,7 @@ struct kvm_mmu_page { struct hlist_node hash_link; struct list_head lpage_disallowed_link; + bool tdp_mmu_page; bool unsync; u8 mmu_valid_gen; bool lpage_disallowed; /* Can't be replaced by an equiv large page */ @@ -70,8 +71,6 @@ struct kvm_mmu_page { atomic_t write_flooding_count; #ifdef CONFIG_X86_64 - bool tdp_mmu_page; - /* Used for freeing the page asynchronously if it is a TDP MMU page. */ struct rcu_head rcu_head; #endif -- cgit v1.2.3 From 1148bfc47be3f885a10945ecbc1e3789a0313603 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Sep 2021 15:10:23 -0700 Subject: KVM: x86/mmu: Move lpage_disallowed_link further "down" in kvm_mmu_page Move "lpage_disallowed_link" out of the first 64 bytes, i.e. out of the first cache line, of kvm_mmu_page so that "spt" and to a lesser extent "gfns" land in the first cache line. "lpage_disallowed_link" is accessed relatively infrequently compared to "spt", which is accessed any time KVM is walking and/or manipulating the shadow page tables. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210901221023.1303578-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu_internal.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 4e7b634efa38..bf2bdbf333c2 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -31,9 +31,12 @@ extern bool dbg; #define IS_VALID_PAE_ROOT(x) (!!(x)) struct kvm_mmu_page { + /* + * Note, "link" through "spt" fit in a single 64 byte cache line on + * 64-bit kernels, keep it that way unless there's a reason not to. + */ struct list_head link; struct hlist_node hash_link; - struct list_head lpage_disallowed_link; bool tdp_mmu_page; bool unsync; @@ -59,6 +62,7 @@ struct kvm_mmu_page { struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ DECLARE_BITMAP(unsync_child_bitmap, 512); + struct list_head lpage_disallowed_link; #ifdef CONFIG_X86_32 /* * Used out of the mmu-lock to avoid reading spte values while an -- cgit v1.2.3 From 3cc4e148b96263313e3dce926eae569c942bb74e Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Tue, 17 Aug 2021 00:26:39 +0000 Subject: KVM: stats: Add VM stat for remote tlb flush requests Add a new stat that counts the number of times a remote TLB flush is requested, regardless of whether it kicks vCPUs out of guest mode. This allows us to look at how often flushes are initiated. Unlike remote_tlb_flush, this one applies to ARM's instruction-set-based TLB flush implementation, so apply it there too. Original-by: David Matlack Signed-off-by: Jing Zhang Message-Id: <20210817002639.3856694-1-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/mmu.c | 1 + include/linux/kvm_host.h | 3 ++- include/linux/kvm_types.h | 1 + virt/kvm/kvm_main.c | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0625bf2353c2..f5bb235bbb59 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -80,6 +80,7 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot) */ void kvm_flush_remote_tlbs(struct kvm *kvm) { + ++kvm->stat.generic.remote_tlb_flush_requests; kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e4d712e9f760..c177789a8542 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1444,7 +1444,8 @@ struct _kvm_stats_desc { KVM_STATS_BASE_POW10, -9, sz) #define KVM_GENERIC_VM_STATS() \ - STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush) + STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush), \ + STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush_requests) #define KVM_GENERIC_VCPU_STATS() \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_successful_poll), \ diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index de7fb5f364d8..2237abb93ccd 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -80,6 +80,7 @@ struct kvm_mmu_memory_cache { struct kvm_vm_stat_generic { u64 remote_tlb_flush; + u64 remote_tlb_flush_requests; }; struct kvm_vcpu_stat_generic { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 140c7d311021..305956310174 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -318,6 +318,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) */ long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); + ++kvm->stat.generic.remote_tlb_flush_requests; /* * We want to publish modifications to the page tables before reading * mode. Pairs with a memory barrier in arch-specific code. -- cgit v1.2.3 From a40b2fd064bb7c0425c7cbdca2120cf0609a90a2 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 14 Aug 2021 11:51:29 +0800 Subject: x86/kvm: Don't enable IRQ when IRQ enabled in kvm_wait Commit f4e61f0c9add3 ("x86/kvm: Fix broken irq restoration in kvm_wait") replaced "local_irq_restore() when IRQ enabled" with "local_irq_enable() when IRQ enabled" to suppress a warnning. Although there is no similar debugging warnning for doing local_irq_enable() when IRQ enabled as doing local_irq_restore() in the same IRQ situation. But doing local_irq_enable() when IRQ enabled is no less broken as doing local_irq_restore() and we'd better avoid it. Cc: Mark Rutland Cc: Peter Zijlstra (Intel) Signed-off-by: Lai Jiangshan Message-Id: <20210814035129.154242-1-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a26643dc6bd6..b656456c3a94 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -884,10 +884,11 @@ static void kvm_wait(u8 *ptr, u8 val) } else { local_irq_disable(); + /* safe_halt() will enable IRQ */ if (READ_ONCE(*ptr) == val) safe_halt(); - - local_irq_enable(); + else + local_irq_enable(); } } -- cgit v1.2.3 From a3cf527e70bd1553500746ef2f38db41e2af1d9e Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 6 Apr 2021 10:49:11 +0800 Subject: KVM: MIPS: Remove a "set but not used" variable This fix a build warning: arch/mips/kvm/vz.c: In function '_kvm_vz_restore_htimer': >> arch/mips/kvm/vz.c:392:10: warning: variable 'freeze_time' set but not used [-Wunused-but-set-variable] 392 | ktime_t freeze_time; | ^~~~~~~~~~~ Reported-by: kernel test robot Signed-off-by: Huacai Chen Message-Id: <20210406024911.2008046-1-chenhuacai@loongson.cn> Signed-off-by: Paolo Bonzini --- arch/mips/kvm/vz.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 43cad10b877d..4adca5abbc72 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -388,7 +388,6 @@ static void _kvm_vz_restore_htimer(struct kvm_vcpu *vcpu, u32 compare, u32 cause) { u32 start_count, after_count; - ktime_t freeze_time; unsigned long flags; /* @@ -396,7 +395,7 @@ static void _kvm_vz_restore_htimer(struct kvm_vcpu *vcpu, * this with interrupts disabled to avoid latency. */ local_irq_save(flags); - freeze_time = kvm_mips_freeze_hrtimer(vcpu, &start_count); + kvm_mips_freeze_hrtimer(vcpu, &start_count); write_c0_gtoffset(start_count - read_c0_count()); local_irq_restore(flags); -- cgit v1.2.3 From 4ac214574d2d83b7ef20c603d75f1937c912bfd6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 6 Sep 2021 05:52:23 -0400 Subject: KVM: MMU: mark role_regs and role accessors as maybe unused It is reasonable for these functions to be used only in some configurations, for example only if the host is 64-bits (and therefore supports 64-bit guests). It is also reasonable to keep the role_regs and role accessors in sync even though some of the accessors may be used only for one of the two sets (as is the case currently for CR4.LA57).. Because clang reports warnings for unused inlines declared in a .c file, mark both sets of accessors as __maybe_unused. Reported-by: kernel test robot Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 9b0cdec8b62d..2d7e61122af8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -204,7 +204,7 @@ struct kvm_mmu_role_regs { * the single source of truth for the MMU's state. */ #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \ -static inline bool ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\ +static inline bool __maybe_unused ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\ { \ return !!(regs->reg & flag); \ } @@ -226,7 +226,7 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA); * and the vCPU may be incorrect/irrelevant. */ #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \ -static inline bool is_##reg##_##name(struct kvm_mmu *mmu) \ +static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \ { \ return !!(mmu->mmu_role. base_or_ext . reg##_##name); \ } -- cgit v1.2.3 From d9130a2dfdd4b21736c91b818f87dbc0ccd1e757 Mon Sep 17 00:00:00 2001 From: Zelin Deng Date: Wed, 28 Apr 2021 10:22:01 +0800 Subject: KVM: x86: Update vCPU's hv_clock before back to guest when tsc_offset is adjusted When MSR_IA32_TSC_ADJUST is written by guest due to TSC ADJUST feature especially there's a big tsc warp (like a new vCPU is hot-added into VM which has been up for a long time), tsc_offset is added by a large value then go back to guest. This causes system time jump as tsc_timestamp is not adjusted in the meantime and pvclock monotonic character. To fix this, just notify kvm to update vCPU's guest time before back to guest. Cc: stable@vger.kernel.org Signed-off-by: Zelin Deng Signed-off-by: Paolo Bonzini Message-Id: <1619576521-81399-2-git-send-email-zelin.deng@linux.alibaba.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1a00af1b076b..28ef14155726 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3321,6 +3321,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) { s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; adjust_tsc_offset_guest(vcpu, adj); + /* Before back to guest, tsc_timestamp must be adjusted + * as well, otherwise guest's percpu pvclock time could jump. + */ + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } vcpu->arch.ia32_tsc_adjust_msr = data; } -- cgit v1.2.3