From 9ed24f4b712b855dcf7be3025b75b051cb73a2b7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 13 May 2020 11:40:34 +0100 Subject: KVM: arm64: Move virt/kvm/arm to arch/arm64 Now that the 32bit KVM/arm host is a distant memory, let's move the whole of the KVM/arm64 code into the arm64 tree. As they said in the song: Welcome Home (Sanitarium). Signed-off-by: Marc Zyngier Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200513104034.74741-1-maz@kernel.org --- MAINTAINERS | 1 - arch/arm64/kvm/Makefile | 44 +- arch/arm64/kvm/aarch32.c | 204 +++ arch/arm64/kvm/arch_timer.c | 1180 ++++++++++++++ arch/arm64/kvm/arm.c | 1681 ++++++++++++++++++++ arch/arm64/kvm/handle_exit.c | 2 +- arch/arm64/kvm/hyp/Makefile | 9 +- arch/arm64/kvm/hyp/aarch32.c | 140 ++ arch/arm64/kvm/hyp/timer-sr.c | 49 + arch/arm64/kvm/hyp/vgic-v3-sr.c | 1126 +++++++++++++ arch/arm64/kvm/hypercalls.c | 71 + arch/arm64/kvm/mmio.c | 200 +++ arch/arm64/kvm/mmu.c | 2447 +++++++++++++++++++++++++++++ arch/arm64/kvm/perf.c | 57 + arch/arm64/kvm/pmu-emul.c | 869 ++++++++++ arch/arm64/kvm/psci.c | 564 +++++++ arch/arm64/kvm/pvtime.c | 131 ++ arch/arm64/kvm/trace.h | 216 +-- arch/arm64/kvm/trace_arm.h | 378 +++++ arch/arm64/kvm/trace_handle_exit.h | 215 +++ arch/arm64/kvm/vgic-sys-reg-v3.c | 2 +- arch/arm64/kvm/vgic/trace.h | 38 + arch/arm64/kvm/vgic/vgic-debug.c | 300 ++++ arch/arm64/kvm/vgic/vgic-init.c | 556 +++++++ arch/arm64/kvm/vgic/vgic-irqfd.c | 141 ++ arch/arm64/kvm/vgic/vgic-its.c | 2783 +++++++++++++++++++++++++++++++++ arch/arm64/kvm/vgic/vgic-kvm-device.c | 741 +++++++++ arch/arm64/kvm/vgic/vgic-mmio-v2.c | 550 +++++++ arch/arm64/kvm/vgic/vgic-mmio-v3.c | 1063 +++++++++++++ arch/arm64/kvm/vgic/vgic-mmio.c | 1088 +++++++++++++ arch/arm64/kvm/vgic/vgic-mmio.h | 227 +++ arch/arm64/kvm/vgic/vgic-v2.c | 504 ++++++ arch/arm64/kvm/vgic/vgic-v3.c | 691 ++++++++ arch/arm64/kvm/vgic/vgic-v4.c | 453 ++++++ arch/arm64/kvm/vgic/vgic.c | 1011 ++++++++++++ arch/arm64/kvm/vgic/vgic.h | 321 ++++ 
virt/kvm/arm/aarch32.c | 204 --- virt/kvm/arm/arch_timer.c | 1180 -------------- virt/kvm/arm/arm.c | 1681 -------------------- virt/kvm/arm/hyp/aarch32.c | 140 -- virt/kvm/arm/hyp/timer-sr.c | 49 - virt/kvm/arm/hyp/vgic-v3-sr.c | 1130 ------------- virt/kvm/arm/hypercalls.c | 71 - virt/kvm/arm/mmio.c | 200 --- virt/kvm/arm/mmu.c | 2447 ----------------------------- virt/kvm/arm/perf.c | 57 - virt/kvm/arm/pmu.c | 869 ---------- virt/kvm/arm/psci.c | 564 ------- virt/kvm/arm/pvtime.c | 131 -- virt/kvm/arm/trace.h | 379 ----- virt/kvm/arm/vgic/trace.h | 38 - virt/kvm/arm/vgic/vgic-debug.c | 300 ---- virt/kvm/arm/vgic/vgic-init.c | 556 ------- virt/kvm/arm/vgic/vgic-irqfd.c | 141 -- virt/kvm/arm/vgic/vgic-its.c | 2783 --------------------------------- virt/kvm/arm/vgic/vgic-kvm-device.c | 741 --------- virt/kvm/arm/vgic/vgic-mmio-v2.c | 550 ------- virt/kvm/arm/vgic/vgic-mmio-v3.c | 1063 ------------- virt/kvm/arm/vgic/vgic-mmio.c | 1088 ------------- virt/kvm/arm/vgic/vgic-mmio.h | 227 --- virt/kvm/arm/vgic/vgic-v2.c | 504 ------ virt/kvm/arm/vgic/vgic-v3.c | 693 -------- virt/kvm/arm/vgic/vgic-v4.c | 453 ------ virt/kvm/arm/vgic/vgic.c | 1011 ------------ virt/kvm/arm/vgic/vgic.h | 321 ---- 65 files changed, 19810 insertions(+), 19814 deletions(-) create mode 100644 arch/arm64/kvm/aarch32.c create mode 100644 arch/arm64/kvm/arch_timer.c create mode 100644 arch/arm64/kvm/arm.c create mode 100644 arch/arm64/kvm/hyp/aarch32.c create mode 100644 arch/arm64/kvm/hyp/timer-sr.c create mode 100644 arch/arm64/kvm/hyp/vgic-v3-sr.c create mode 100644 arch/arm64/kvm/hypercalls.c create mode 100644 arch/arm64/kvm/mmio.c create mode 100644 arch/arm64/kvm/mmu.c create mode 100644 arch/arm64/kvm/perf.c create mode 100644 arch/arm64/kvm/pmu-emul.c create mode 100644 arch/arm64/kvm/psci.c create mode 100644 arch/arm64/kvm/pvtime.c create mode 100644 arch/arm64/kvm/trace_arm.h create mode 100644 arch/arm64/kvm/trace_handle_exit.h create mode 100644 arch/arm64/kvm/vgic/trace.h create 
mode 100644 arch/arm64/kvm/vgic/vgic-debug.c create mode 100644 arch/arm64/kvm/vgic/vgic-init.c create mode 100644 arch/arm64/kvm/vgic/vgic-irqfd.c create mode 100644 arch/arm64/kvm/vgic/vgic-its.c create mode 100644 arch/arm64/kvm/vgic/vgic-kvm-device.c create mode 100644 arch/arm64/kvm/vgic/vgic-mmio-v2.c create mode 100644 arch/arm64/kvm/vgic/vgic-mmio-v3.c create mode 100644 arch/arm64/kvm/vgic/vgic-mmio.c create mode 100644 arch/arm64/kvm/vgic/vgic-mmio.h create mode 100644 arch/arm64/kvm/vgic/vgic-v2.c create mode 100644 arch/arm64/kvm/vgic/vgic-v3.c create mode 100644 arch/arm64/kvm/vgic/vgic-v4.c create mode 100644 arch/arm64/kvm/vgic/vgic.c create mode 100644 arch/arm64/kvm/vgic/vgic.h delete mode 100644 virt/kvm/arm/aarch32.c delete mode 100644 virt/kvm/arm/arch_timer.c delete mode 100644 virt/kvm/arm/arm.c delete mode 100644 virt/kvm/arm/hyp/aarch32.c delete mode 100644 virt/kvm/arm/hyp/timer-sr.c delete mode 100644 virt/kvm/arm/hyp/vgic-v3-sr.c delete mode 100644 virt/kvm/arm/hypercalls.c delete mode 100644 virt/kvm/arm/mmio.c delete mode 100644 virt/kvm/arm/mmu.c delete mode 100644 virt/kvm/arm/perf.c delete mode 100644 virt/kvm/arm/pmu.c delete mode 100644 virt/kvm/arm/psci.c delete mode 100644 virt/kvm/arm/pvtime.c delete mode 100644 virt/kvm/arm/trace.h delete mode 100644 virt/kvm/arm/vgic/trace.h delete mode 100644 virt/kvm/arm/vgic/vgic-debug.c delete mode 100644 virt/kvm/arm/vgic/vgic-init.c delete mode 100644 virt/kvm/arm/vgic/vgic-irqfd.c delete mode 100644 virt/kvm/arm/vgic/vgic-its.c delete mode 100644 virt/kvm/arm/vgic/vgic-kvm-device.c delete mode 100644 virt/kvm/arm/vgic/vgic-mmio-v2.c delete mode 100644 virt/kvm/arm/vgic/vgic-mmio-v3.c delete mode 100644 virt/kvm/arm/vgic/vgic-mmio.c delete mode 100644 virt/kvm/arm/vgic/vgic-mmio.h delete mode 100644 virt/kvm/arm/vgic/vgic-v2.c delete mode 100644 virt/kvm/arm/vgic/vgic-v3.c delete mode 100644 virt/kvm/arm/vgic/vgic-v4.c delete mode 100644 virt/kvm/arm/vgic/vgic.c delete mode 100644 
virt/kvm/arm/vgic/vgic.h diff --git a/MAINTAINERS b/MAINTAINERS index 091ec22c1a23..6c5b928989ed 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9295,7 +9295,6 @@ F: arch/arm64/include/asm/kvm* F: arch/arm64/include/uapi/asm/kvm* F: arch/arm64/kvm/ F: include/kvm/arm_* -F: virt/kvm/arm/ KERNEL VIRTUAL MACHINE FOR MIPS (KVM/mips) L: linux-mips@vger.kernel.org diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 5ffbdc39e780..7a3768538343 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -3,37 +3,37 @@ # Makefile for Kernel-based Virtual Machine module # -ccflags-y += -I $(srctree)/$(src) -I $(srctree)/virt/kvm/arm/vgic +ccflags-y += -I $(srctree)/$(src) KVM=../../../virt/kvm obj-$(CONFIG_KVM_ARM_HOST) += kvm.o obj-$(CONFIG_KVM_ARM_HOST) += hyp/ -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arm.o $(KVM)/arm/mmu.o $(KVM)/arm/mmio.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/psci.o $(KVM)/arm/perf.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hypercalls.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/pvtime.o +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/eventfd.o $(KVM)/vfio.o $(KVM)/irqchip.o +kvm-$(CONFIG_KVM_ARM_HOST) += arm.o mmu.o mmio.o +kvm-$(CONFIG_KVM_ARM_HOST) += psci.o perf.o +kvm-$(CONFIG_KVM_ARM_HOST) += hypercalls.o +kvm-$(CONFIG_KVM_ARM_HOST) += pvtime.o kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o va_layout.o kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o kvm-$(CONFIG_KVM_ARM_HOST) += vgic-sys-reg-v3.o fpsimd.o pmu.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/aarch32.o +kvm-$(CONFIG_KVM_ARM_HOST) += aarch32.o +kvm-$(CONFIG_KVM_ARM_HOST) += arch_timer.o +kvm-$(CONFIG_KVM_ARM_PMU) += pmu-emul.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o 
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-irqfd.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v2.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v3.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v4.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-its.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-debug.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/irqchip.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o -kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o +kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic.o +kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-init.o +kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-irqfd.o +kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-v2.o +kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-v3.o +kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-v4.o +kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-mmio.o +kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-mmio-v2.o +kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-mmio-v3.o +kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-kvm-device.o +kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-its.o +kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-debug.o diff --git a/arch/arm64/kvm/aarch32.c b/arch/arm64/kvm/aarch32.c new file mode 100644 index 000000000000..0a356aa91aa1 --- /dev/null +++ b/arch/arm64/kvm/aarch32.c @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * (not much of an) Emulation layer for 32bit guests. 
+ * + * Copyright (C) 2012,2013 - ARM Ltd + * Author: Marc Zyngier + * + * based on arch/arm/kvm/emulate.c + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall + */ + +#include +#include +#include +#include + +#define DFSR_FSC_EXTABT_LPAE 0x10 +#define DFSR_FSC_EXTABT_nLPAE 0x08 +#define DFSR_LPAE BIT(9) + +/* + * Table taken from ARMv8 ARM DDI0487B-B, table G1-10. + */ +static const u8 return_offsets[8][2] = { + [0] = { 0, 0 }, /* Reset, unused */ + [1] = { 4, 2 }, /* Undefined */ + [2] = { 0, 0 }, /* SVC, unused */ + [3] = { 4, 4 }, /* Prefetch abort */ + [4] = { 8, 8 }, /* Data abort */ + [5] = { 0, 0 }, /* HVC, unused */ + [6] = { 4, 4 }, /* IRQ, unused */ + [7] = { 4, 4 }, /* FIQ, unused */ +}; + +/* + * When an exception is taken, most CPSR fields are left unchanged in the + * handler. However, some are explicitly overridden (e.g. M[4:0]). + * + * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with + * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was + * obsoleted by the ARMv7 virtualization extensions and is RES0. + * + * For the SPSR layout seen from AArch32, see: + * - ARM DDI 0406C.d, page B1-1148 + * - ARM DDI 0487E.a, page G8-6264 + * + * For the SPSR_ELx layout for AArch32 seen from AArch64, see: + * - ARM DDI 0487E.a, page C5-426 + * + * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from + * MSB to LSB. 
+ */ +static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode) +{ + u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); + unsigned long old, new; + + old = *vcpu_cpsr(vcpu); + new = 0; + + new |= (old & PSR_AA32_N_BIT); + new |= (old & PSR_AA32_Z_BIT); + new |= (old & PSR_AA32_C_BIT); + new |= (old & PSR_AA32_V_BIT); + new |= (old & PSR_AA32_Q_BIT); + + // CPSR.IT[7:0] are set to zero upon any exception + // See ARM DDI 0487E.a, section G1.12.3 + // See ARM DDI 0406C.d, section B1.8.3 + + new |= (old & PSR_AA32_DIT_BIT); + + // CPSR.SSBS is set to SCTLR.DSSBS upon any exception + // See ARM DDI 0487E.a, page G8-6244 + if (sctlr & BIT(31)) + new |= PSR_AA32_SSBS_BIT; + + // CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0 + // SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented + // See ARM DDI 0487E.a, page G8-6246 + new |= (old & PSR_AA32_PAN_BIT); + if (!(sctlr & BIT(23))) + new |= PSR_AA32_PAN_BIT; + + // SS does not exist in AArch32, so ignore + + // CPSR.IL is set to zero upon any exception + // See ARM DDI 0487E.a, page G1-5527 + + new |= (old & PSR_AA32_GE_MASK); + + // CPSR.IT[7:0] are set to zero upon any exception + // See prior comment above + + // CPSR.E is set to SCTLR.EE upon any exception + // See ARM DDI 0487E.a, page G8-6245 + // See ARM DDI 0406C.d, page B4-1701 + if (sctlr & BIT(25)) + new |= PSR_AA32_E_BIT; + + // CPSR.A is unchanged upon an exception to Undefined, Supervisor + // CPSR.A is set upon an exception to other modes + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 + // See ARM DDI 0406C.d, page B1-1182 + new |= (old & PSR_AA32_A_BIT); + if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC) + new |= PSR_AA32_A_BIT; + + // CPSR.I is set upon any exception + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 + // See ARM DDI 0406C.d, page B1-1182 + new |= PSR_AA32_I_BIT; + + // CPSR.F is set upon an exception to FIQ + // CPSR.F is unchanged upon an exception to other modes + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 + // 
See ARM DDI 0406C.d, page B1-1182 + new |= (old & PSR_AA32_F_BIT); + if (mode == PSR_AA32_MODE_FIQ) + new |= PSR_AA32_F_BIT; + + // CPSR.T is set to SCTLR.TE upon any exception + // See ARM DDI 0487E.a, page G8-5514 + // See ARM DDI 0406C.d, page B1-1181 + if (sctlr & BIT(30)) + new |= PSR_AA32_T_BIT; + + new |= mode; + + return new; +} + +static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) +{ + unsigned long spsr = *vcpu_cpsr(vcpu); + bool is_thumb = (spsr & PSR_AA32_T_BIT); + u32 return_offset = return_offsets[vect_offset >> 2][is_thumb]; + u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); + + *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode); + + /* Note: These now point to the banked copies */ + vcpu_write_spsr(vcpu, host_spsr_to_spsr32(spsr)); + *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; + + /* Branch to exception vector */ + if (sctlr & (1 << 13)) + vect_offset += 0xffff0000; + else /* always have security exceptions */ + vect_offset += vcpu_cp15(vcpu, c12_VBAR); + + *vcpu_pc(vcpu) = vect_offset; +} + +void kvm_inject_undef32(struct kvm_vcpu *vcpu) +{ + prepare_fault32(vcpu, PSR_AA32_MODE_UND, 4); +} + +/* + * Modelled after TakeDataAbortException() and TakePrefetchAbortException + * pseudocode. 
+ */ +static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, + unsigned long addr) +{ + u32 vect_offset; + u32 *far, *fsr; + bool is_lpae; + + if (is_pabt) { + vect_offset = 12; /* prefetch abort vector offset */ + far = &vcpu_cp15(vcpu, c6_IFAR); + fsr = &vcpu_cp15(vcpu, c5_IFSR); + } else { /* !is_pabt: data abort */ + vect_offset = 16; /* data abort vector offset */ + far = &vcpu_cp15(vcpu, c6_DFAR); + fsr = &vcpu_cp15(vcpu, c5_DFSR); + } + + prepare_fault32(vcpu, PSR_AA32_MODE_ABT, vect_offset); + + *far = addr; /* report the faulting address in IFAR or DFAR */ + + /* Give the guest an IMPLEMENTATION DEFINED exception */ + is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); + if (is_lpae) { + *fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE; + } else { + /* no need to shuffle FS[4] into DFSR[10] as it is 0 */ + *fsr = DFSR_FSC_EXTABT_nLPAE; + } +} + +void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr) +{ + inject_abt32(vcpu, false, addr); +} + +void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr) +{ + inject_abt32(vcpu, true, addr); +} diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c new file mode 100644 index 000000000000..93bd59b46848 --- /dev/null +++ b/arch/arm64/kvm/arch_timer.c @@ -0,0 +1,1180 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 ARM Ltd. 
+ * Author: Marc Zyngier + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include "trace.h" + +static struct timecounter *timecounter; +static unsigned int host_vtimer_irq; +static unsigned int host_ptimer_irq; +static u32 host_vtimer_irq_flags; +static u32 host_ptimer_irq_flags; + +static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); + +static const struct kvm_irq_level default_ptimer_irq = { + .irq = 30, + .level = 1, +}; + +static const struct kvm_irq_level default_vtimer_irq = { + .irq = 27, + .level = 1, +}; + +static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx); +static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, + struct arch_timer_context *timer_ctx); +static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); +static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg, + u64 val); +static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg); + +u64 kvm_phys_timer_read(void) +{ + return timecounter->cc->read(timecounter->cc); +} + +static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) +{ + if (has_vhe()) { + map->direct_vtimer = vcpu_vtimer(vcpu); + map->direct_ptimer = vcpu_ptimer(vcpu); + map->emul_ptimer = NULL; + } else { + map->direct_vtimer = vcpu_vtimer(vcpu); + map->direct_ptimer = NULL; + map->emul_ptimer = vcpu_ptimer(vcpu); + } + + trace_kvm_get_timer_map(vcpu->vcpu_id, map); +} + +static inline bool userspace_irqchip(struct kvm *kvm) +{ + return static_branch_unlikely(&userspace_irqchip_in_use) && + unlikely(!irqchip_in_kernel(kvm)); +} + +static void soft_timer_start(struct hrtimer *hrt, u64 ns) +{ + hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns), + HRTIMER_MODE_ABS_HARD); +} + +static void soft_timer_cancel(struct hrtimer *hrt) +{ + hrtimer_cancel(hrt); +} + 
+static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) +{ + struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; + struct arch_timer_context *ctx; + struct timer_map map; + + /* + * We may see a timer interrupt after vcpu_put() has been called which + * sets the CPU's vcpu pointer to NULL, because even though the timer + * has been disabled in timer_save_state(), the hardware interrupt + * signal may not have been retired from the interrupt controller yet. + */ + if (!vcpu) + return IRQ_HANDLED; + + get_timer_map(vcpu, &map); + + if (irq == host_vtimer_irq) + ctx = map.direct_vtimer; + else + ctx = map.direct_ptimer; /* NULL when !has_vhe(); kvm_timer_should_fire() handles a NULL context */ + + if (kvm_timer_should_fire(ctx)) + kvm_timer_update_irq(vcpu, true, ctx); + + if (userspace_irqchip(vcpu->kvm) && + !static_branch_unlikely(&has_gic_active_state)) + disable_percpu_irq(host_vtimer_irq); + + return IRQ_HANDLED; +} + +static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) +{ + u64 cval, now; + + cval = timer_ctx->cnt_cval; + now = kvm_phys_timer_read() - timer_ctx->cntvoff; + + if (now < cval) { + u64 ns; + + ns = cyclecounter_cyc2ns(timecounter->cc, + cval - now, + timecounter->mask, + &timecounter->frac); + return ns; + } + + return 0; /* cval already reached: no time left to wait */ +} + +static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) +{ + WARN_ON(timer_ctx && timer_ctx->loaded); + return timer_ctx && + !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) && + (timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE); +} + +/* + * Returns the earliest expiration time in ns among guest timers. + * Note that it will return 0 if none of the timers can fire. 
+ */ +static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) +{ + u64 min_delta = ULLONG_MAX; + int i; + + for (i = 0; i < NR_KVM_TIMERS; i++) { + struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i]; + + WARN(ctx->loaded, "timer %d loaded\n", i); + if (kvm_timer_irq_can_fire(ctx)) + min_delta = min(min_delta, kvm_timer_compute_delta(ctx)); + } + + /* If none of the timers can fire, then return 0 */ + if (min_delta == ULLONG_MAX) + return 0; + + return min_delta; /* may also be 0 when a timer that can fire is already due */ +} + +static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) +{ + struct arch_timer_cpu *timer; + struct kvm_vcpu *vcpu; + u64 ns; + + timer = container_of(hrt, struct arch_timer_cpu, bg_timer); + vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu); + + /* + * Check that the timer has really expired from the guest's + * PoV (NTP on the host may have forced it to expire + * early). If we should have slept longer, restart it. + */ + ns = kvm_timer_earliest_exp(vcpu); + if (unlikely(ns)) { + hrtimer_forward_now(hrt, ns_to_ktime(ns)); + return HRTIMER_RESTART; + } + + kvm_vcpu_wake_up(vcpu); + return HRTIMER_NORESTART; +} + +static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt) +{ + struct arch_timer_context *ctx; + struct kvm_vcpu *vcpu; + u64 ns; + + ctx = container_of(hrt, struct arch_timer_context, hrtimer); + vcpu = ctx->vcpu; + + trace_kvm_timer_hrtimer_expire(ctx); + + /* + * Check that the timer has really expired from the guest's + * PoV (NTP on the host may have forced it to expire + * early). If not ready, schedule for a later time. 
+ */ + ns = kvm_timer_compute_delta(ctx); + if (unlikely(ns)) { + hrtimer_forward_now(hrt, ns_to_ktime(ns)); + return HRTIMER_RESTART; + } + + kvm_timer_update_irq(vcpu, true, ctx); + return HRTIMER_NORESTART; +} + +static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) +{ + enum kvm_arch_timers index; + u64 cval, now; + + if (!timer_ctx) + return false; + + index = arch_timer_ctx_index(timer_ctx); + + if (timer_ctx->loaded) { + u32 cnt_ctl = 0; + + switch (index) { + case TIMER_VTIMER: + cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL); + break; + case TIMER_PTIMER: + cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL); + break; + case NR_KVM_TIMERS: + /* GCC is braindead */ + cnt_ctl = 0; + break; + } + + return (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) && + (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) && + !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK); + } + + if (!kvm_timer_irq_can_fire(timer_ctx)) + return false; + + cval = timer_ctx->cnt_cval; + now = kvm_phys_timer_read() - timer_ctx->cntvoff; + + return cval <= now; +} + +bool kvm_timer_is_pending(struct kvm_vcpu *vcpu) +{ + struct timer_map map; + + get_timer_map(vcpu, &map); + + return kvm_timer_should_fire(map.direct_vtimer) || + kvm_timer_should_fire(map.direct_ptimer) || + kvm_timer_should_fire(map.emul_ptimer); +} + +/* + * Reflect the timer output level into the kvm_run structure + */ +void kvm_timer_update_run(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct kvm_sync_regs *regs = &vcpu->run->s.regs; + + /* Populate the device bitmap with the timer states */ + regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER | + KVM_ARM_DEV_EL1_PTIMER); + if (kvm_timer_should_fire(vtimer)) + regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER; + if (kvm_timer_should_fire(ptimer)) + regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; +} + +static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, + struct arch_timer_context *timer_ctx) 
+{ + int ret; + + timer_ctx->irq.level = new_level; + trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq, + timer_ctx->irq.level); + + if (!userspace_irqchip(vcpu->kvm)) { + ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, + timer_ctx->irq.irq, + timer_ctx->irq.level, + timer_ctx); + WARN_ON(ret); + } +} + +/* Only called for a fully emulated timer */ +static void timer_emulate(struct arch_timer_context *ctx) +{ + bool should_fire = kvm_timer_should_fire(ctx); + + trace_kvm_timer_emulate(ctx, should_fire); + + if (should_fire != ctx->irq.level) { + kvm_timer_update_irq(ctx->vcpu, should_fire, ctx); + return; + } + + /* + * If the timer can fire now, we don't need to have a soft timer + * scheduled for the future. If the timer cannot fire at all, + * then we also don't need a soft timer. + */ + if (!kvm_timer_irq_can_fire(ctx)) { + soft_timer_cancel(&ctx->hrtimer); + return; + } + + soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx)); +} + +static void timer_save_state(struct arch_timer_context *ctx) +{ + struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); + enum kvm_arch_timers index = arch_timer_ctx_index(ctx); + unsigned long flags; + + if (!timer->enabled) + return; + + local_irq_save(flags); + + if (!ctx->loaded) + goto out; + + switch (index) { + case TIMER_VTIMER: + ctx->cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL); + ctx->cnt_cval = read_sysreg_el0(SYS_CNTV_CVAL); + + /* Disable the timer */ + write_sysreg_el0(0, SYS_CNTV_CTL); + isb(); + + break; + case TIMER_PTIMER: + ctx->cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL); + ctx->cnt_cval = read_sysreg_el0(SYS_CNTP_CVAL); + + /* Disable the timer */ + write_sysreg_el0(0, SYS_CNTP_CTL); + isb(); + + break; + case NR_KVM_TIMERS: + BUG(); + } + + trace_kvm_timer_save_state(ctx); + + ctx->loaded = false; +out: + local_irq_restore(flags); +} + +/* + * Schedule the background timer before calling kvm_vcpu_block, so that this + * thread is removed from its waitqueue and made runnable when there's a 
timer + * interrupt to handle. + */ +static void kvm_timer_blocking(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; + + get_timer_map(vcpu, &map); + + /* + * If no timers are capable of raising interrupts (disabled or + * masked), then there's no more work for us to do. + */ + if (!kvm_timer_irq_can_fire(map.direct_vtimer) && + !kvm_timer_irq_can_fire(map.direct_ptimer) && + !kvm_timer_irq_can_fire(map.emul_ptimer)) + return; + + /* + * At least one guest timer will expire. Schedule a background timer. + * Set the earliest expiration time among the guest timers. + */ + soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu)); +} + +static void kvm_timer_unblocking(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + + soft_timer_cancel(&timer->bg_timer); +} + +static void timer_restore_state(struct arch_timer_context *ctx) +{ + struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); + enum kvm_arch_timers index = arch_timer_ctx_index(ctx); + unsigned long flags; + + if (!timer->enabled) + return; + + local_irq_save(flags); + + if (ctx->loaded) + goto out; /* already marked loaded: skip the register writes */ + + switch (index) { + case TIMER_VTIMER: + write_sysreg_el0(ctx->cnt_cval, SYS_CNTV_CVAL); + isb(); + write_sysreg_el0(ctx->cnt_ctl, SYS_CNTV_CTL); + break; + case TIMER_PTIMER: + write_sysreg_el0(ctx->cnt_cval, SYS_CNTP_CVAL); + isb(); + write_sysreg_el0(ctx->cnt_ctl, SYS_CNTP_CTL); + break; + case NR_KVM_TIMERS: + BUG(); + } + + trace_kvm_timer_restore_state(ctx); + + ctx->loaded = true; +out: + local_irq_restore(flags); +} + +static void set_cntvoff(u64 cntvoff) +{ + u32 low = lower_32_bits(cntvoff); + u32 high = upper_32_bits(cntvoff); + + /* + * Since kvm_call_hyp doesn't fully support the ARM PCS especially on + * 32-bit systems, but rather passes register by register shifted one + * place (we put the function address in r0/x0), we cannot simply pass + * a 64-bit value as an argument, but have to split the value in two + * 
32-bit halves. + */ + kvm_call_hyp(__kvm_timer_set_cntvoff, low, high); +} + +static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active) +{ + int r; + r = irq_set_irqchip_state(ctx->host_timer_irq, IRQCHIP_STATE_ACTIVE, active); + WARN_ON(r); +} + +static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) +{ + struct kvm_vcpu *vcpu = ctx->vcpu; + bool phys_active = false; + + /* + * Update the timer output so that it is likely to match the + * state we're about to restore. If the timer expires between + * this point and the register restoration, we'll take the + * interrupt anyway. + */ + kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx); + + if (irqchip_in_kernel(vcpu->kvm)) + phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq); + + phys_active |= ctx->irq.level; + + set_timer_irq_phys_active(ctx, phys_active); +} + +static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + + /* + * Update the timer output so that it is likely to match the + * state we're about to restore. If the timer expires between + * this point and the register restoration, we'll take the + * interrupt anyway. + */ + kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vtimer), vtimer); + + /* + * When using a userspace irqchip with the architected timers and a + * host interrupt controller that doesn't support an active state, we + * must still prevent continuously exiting from the guest, and + * therefore mask the physical interrupt by disabling it on the host + * interrupt controller when the virtual level is high, such that the + * guest can make forward progress. Once we detect the output level + * being de-asserted, we unmask the interrupt again so that we exit + * from the guest when the timer fires. 
+ */ + if (vtimer->irq.level) + disable_percpu_irq(host_vtimer_irq); + else + enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); +} + +void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; + + if (unlikely(!timer->enabled)) + return; + + get_timer_map(vcpu, &map); + + if (static_branch_likely(&has_gic_active_state)) { + kvm_timer_vcpu_load_gic(map.direct_vtimer); + if (map.direct_ptimer) + kvm_timer_vcpu_load_gic(map.direct_ptimer); + } else { + kvm_timer_vcpu_load_nogic(vcpu); + } + + set_cntvoff(map.direct_vtimer->cntvoff); + + kvm_timer_unblocking(vcpu); + + timer_restore_state(map.direct_vtimer); + if (map.direct_ptimer) + timer_restore_state(map.direct_ptimer); + + if (map.emul_ptimer) + timer_emulate(map.emul_ptimer); +} + +bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct kvm_sync_regs *sregs = &vcpu->run->s.regs; + bool vlevel, plevel; + + if (likely(irqchip_in_kernel(vcpu->kvm))) + return false; + + vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER; + plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER; + + return kvm_timer_should_fire(vtimer) != vlevel || + kvm_timer_should_fire(ptimer) != plevel; +} + +void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; + + if (unlikely(!timer->enabled)) + return; + + get_timer_map(vcpu, &map); + + timer_save_state(map.direct_vtimer); + if (map.direct_ptimer) + timer_save_state(map.direct_ptimer); + + /* + * Cancel soft timer emulation, because the only case where we + * need it after a vcpu_put is in the context of a sleeping VCPU, and + * in that case we already factor in the deadline for the physical + * timer when scheduling the bg_timer. 
+ * + * In any case, we re-schedule the hrtimer for the physical timer when + * coming back to the VCPU thread in kvm_timer_vcpu_load(). + */ + if (map.emul_ptimer) + soft_timer_cancel(&map.emul_ptimer->hrtimer); + + if (swait_active(kvm_arch_vcpu_wq(vcpu))) + kvm_timer_blocking(vcpu); + + /* + * The kernel may decide to run userspace after calling vcpu_put, so + * we reset cntvoff to 0 to ensure a consistent read between user + * accesses to the virtual counter and kernel access to the physical + * counter of non-VHE case. For VHE, the virtual counter uses a fixed + * virtual offset of zero, so no need to zero CNTVOFF_EL2 register. + */ + set_cntvoff(0); +} + +/* + * With a userspace irqchip we have to check if the guest de-asserted the + * timer and if so, unmask the timer irq signal on the host interrupt + * controller to ensure that we see future timer signals. + */ +static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + + if (!kvm_timer_should_fire(vtimer)) { + kvm_timer_update_irq(vcpu, false, vtimer); + if (static_branch_likely(&has_gic_active_state)) + set_timer_irq_phys_active(vtimer, false); + else + enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); + } +} + +void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + + if (unlikely(!timer->enabled)) + return; + + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) + unmask_vtimer_irq_user(vcpu); +} + +int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; + + get_timer_map(vcpu, &map); + + /* + * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 + * and to 0 for ARMv7. We provide an implementation that always + * resets the timer to be disabled and unmasked and is compliant with + * the ARMv7 architecture. 
+ */ + vcpu_vtimer(vcpu)->cnt_ctl = 0; + vcpu_ptimer(vcpu)->cnt_ctl = 0; + + if (timer->enabled) { + kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu)); + kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu)); + + if (irqchip_in_kernel(vcpu->kvm)) { + kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq); + if (map.direct_ptimer) + kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq); + } + } + + if (map.emul_ptimer) + soft_timer_cancel(&map.emul_ptimer->hrtimer); + + return 0; +} + +/* Make the updates of cntvoff for all vtimer contexts atomic */ +static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff) +{ + int i; + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *tmp; + + mutex_lock(&kvm->lock); + kvm_for_each_vcpu(i, tmp, kvm) + vcpu_vtimer(tmp)->cntvoff = cntvoff; + + /* + * When called from the vcpu create path, the CPU being created is not + * included in the loop above, so we just set it here as well. + */ + vcpu_vtimer(vcpu)->cntvoff = cntvoff; + mutex_unlock(&kvm->lock); +} + +void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + + /* Synchronize cntvoff across all vtimers of a VM. 
*/ + update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); + ptimer->cntvoff = 0; + + hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + timer->bg_timer.function = kvm_bg_timer_expire; + + hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + vtimer->hrtimer.function = kvm_hrtimer_expire; + ptimer->hrtimer.function = kvm_hrtimer_expire; + + vtimer->irq.irq = default_vtimer_irq.irq; + ptimer->irq.irq = default_ptimer_irq.irq; + + vtimer->host_timer_irq = host_vtimer_irq; + ptimer->host_timer_irq = host_ptimer_irq; + + vtimer->host_timer_irq_flags = host_vtimer_irq_flags; + ptimer->host_timer_irq_flags = host_ptimer_irq_flags; + + vtimer->vcpu = vcpu; + ptimer->vcpu = vcpu; +} + +static void kvm_timer_init_interrupt(void *info) +{ + enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); + enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags); +} + +int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) +{ + struct arch_timer_context *timer; + + switch (regid) { + case KVM_REG_ARM_TIMER_CTL: + timer = vcpu_vtimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); + break; + case KVM_REG_ARM_TIMER_CNT: + timer = vcpu_vtimer(vcpu); + update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value); + break; + case KVM_REG_ARM_TIMER_CVAL: + timer = vcpu_vtimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); + break; + case KVM_REG_ARM_PTIMER_CTL: + timer = vcpu_ptimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); + break; + case KVM_REG_ARM_PTIMER_CVAL: + timer = vcpu_ptimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); + break; + + default: + return -1; + } + + return 0; +} + +static u64 read_timer_ctl(struct arch_timer_context *timer) +{ + /* + * Set ISTATUS bit if it's expired. 
+ * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is + * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit + * regardless of ENABLE bit for our implementation convenience. + */ + if (!kvm_timer_compute_delta(timer)) + return timer->cnt_ctl | ARCH_TIMER_CTRL_IT_STAT; + else + return timer->cnt_ctl; +} + +u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) +{ + switch (regid) { + case KVM_REG_ARM_TIMER_CTL: + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CTL); + case KVM_REG_ARM_TIMER_CNT: + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CNT); + case KVM_REG_ARM_TIMER_CVAL: + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CVAL); + case KVM_REG_ARM_PTIMER_CTL: + return kvm_arm_timer_read(vcpu, + vcpu_ptimer(vcpu), TIMER_REG_CTL); + case KVM_REG_ARM_PTIMER_CNT: + return kvm_arm_timer_read(vcpu, + vcpu_ptimer(vcpu), TIMER_REG_CNT); + case KVM_REG_ARM_PTIMER_CVAL: + return kvm_arm_timer_read(vcpu, + vcpu_ptimer(vcpu), TIMER_REG_CVAL); + } + return (u64)-1; +} + +static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg) +{ + u64 val; + + switch (treg) { + case TIMER_REG_TVAL: + val = timer->cnt_cval - kvm_phys_timer_read() + timer->cntvoff; + val &= lower_32_bits(val); + break; + + case TIMER_REG_CTL: + val = read_timer_ctl(timer); + break; + + case TIMER_REG_CVAL: + val = timer->cnt_cval; + break; + + case TIMER_REG_CNT: + val = kvm_phys_timer_read() - timer->cntvoff; + break; + + default: + BUG(); + } + + return val; +} + +u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, + enum kvm_arch_timers tmr, + enum kvm_arch_timer_regs treg) +{ + u64 val; + + preempt_disable(); + kvm_timer_vcpu_put(vcpu); + + val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg); + + kvm_timer_vcpu_load(vcpu); + preempt_enable(); + + return val; +} + +static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + 
enum kvm_arch_timer_regs treg, + u64 val) +{ + switch (treg) { + case TIMER_REG_TVAL: + timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + (s32)val; + break; + + case TIMER_REG_CTL: + timer->cnt_ctl = val & ~ARCH_TIMER_CTRL_IT_STAT; + break; + + case TIMER_REG_CVAL: + timer->cnt_cval = val; + break; + + default: + BUG(); + } +} + +void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu, + enum kvm_arch_timers tmr, + enum kvm_arch_timer_regs treg, + u64 val) +{ + preempt_disable(); + kvm_timer_vcpu_put(vcpu); + + kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val); + + kvm_timer_vcpu_load(vcpu); + preempt_enable(); +} + +static int kvm_timer_starting_cpu(unsigned int cpu) +{ + kvm_timer_init_interrupt(NULL); + return 0; +} + +static int kvm_timer_dying_cpu(unsigned int cpu) +{ + disable_percpu_irq(host_vtimer_irq); + return 0; +} + +int kvm_timer_hyp_init(bool has_gic) +{ + struct arch_timer_kvm_info *info; + int err; + + info = arch_timer_get_kvm_info(); + timecounter = &info->timecounter; + + if (!timecounter->cc) { + kvm_err("kvm_arch_timer: uninitialized timecounter\n"); + return -ENODEV; + } + + /* First, do the virtual EL1 timer irq */ + + if (info->virtual_irq <= 0) { + kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n", + info->virtual_irq); + return -ENODEV; + } + host_vtimer_irq = info->virtual_irq; + + host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq); + if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH && + host_vtimer_irq_flags != IRQF_TRIGGER_LOW) { + kvm_err("Invalid trigger for vtimer IRQ%d, assuming level low\n", + host_vtimer_irq); + host_vtimer_irq_flags = IRQF_TRIGGER_LOW; + } + + err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler, + "kvm guest vtimer", kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: can't request vtimer interrupt %d (%d)\n", + host_vtimer_irq, err); + return err; + } + + if (has_gic) { + err = irq_set_vcpu_affinity(host_vtimer_irq, + kvm_get_running_vcpus()); + if 
(err) { + kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); + goto out_free_irq; + } + + static_branch_enable(&has_gic_active_state); + } + + kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq); + + /* Now let's do the physical EL1 timer irq */ + + if (info->physical_irq > 0) { + host_ptimer_irq = info->physical_irq; + host_ptimer_irq_flags = irq_get_trigger_type(host_ptimer_irq); + if (host_ptimer_irq_flags != IRQF_TRIGGER_HIGH && + host_ptimer_irq_flags != IRQF_TRIGGER_LOW) { + kvm_err("Invalid trigger for ptimer IRQ%d, assuming level low\n", + host_ptimer_irq); + host_ptimer_irq_flags = IRQF_TRIGGER_LOW; + } + + err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler, + "kvm guest ptimer", kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n", + host_ptimer_irq, err); + return err; + } + + if (has_gic) { + err = irq_set_vcpu_affinity(host_ptimer_irq, + kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); + goto out_free_irq; + } + } + + kvm_debug("physical timer IRQ%d\n", host_ptimer_irq); + } else if (has_vhe()) { + kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n", + info->physical_irq); + err = -ENODEV; + goto out_free_irq; + } + + cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, + "kvm/arm/timer:starting", kvm_timer_starting_cpu, + kvm_timer_dying_cpu); + return 0; +out_free_irq: + free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus()); + return err; +} + +void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + + soft_timer_cancel(&timer->bg_timer); +} + +static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) +{ + int vtimer_irq, ptimer_irq; + int i, ret; + + vtimer_irq = vcpu_vtimer(vcpu)->irq.irq; + ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu)); + if (ret) + return false; + + ptimer_irq = vcpu_ptimer(vcpu)->irq.irq; + ret = kvm_vgic_set_owner(vcpu, 
ptimer_irq, vcpu_ptimer(vcpu)); + if (ret) + return false; + + kvm_for_each_vcpu(i, vcpu, vcpu->kvm) { + if (vcpu_vtimer(vcpu)->irq.irq != vtimer_irq || + vcpu_ptimer(vcpu)->irq.irq != ptimer_irq) + return false; + } + + return true; +} + +bool kvm_arch_timer_get_input_level(int vintid) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + struct arch_timer_context *timer; + + if (vintid == vcpu_vtimer(vcpu)->irq.irq) + timer = vcpu_vtimer(vcpu); + else if (vintid == vcpu_ptimer(vcpu)->irq.irq) + timer = vcpu_ptimer(vcpu); + else + BUG(); + + return kvm_timer_should_fire(timer); +} + +int kvm_timer_enable(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; + int ret; + + if (timer->enabled) + return 0; + + /* Without a VGIC we do not map virtual IRQs to physical IRQs */ + if (!irqchip_in_kernel(vcpu->kvm)) + goto no_vgic; + + if (!vgic_initialized(vcpu->kvm)) + return -ENODEV; + + if (!timer_irqs_are_valid(vcpu)) { + kvm_debug("incorrectly configured timer irqs\n"); + return -EINVAL; + } + + get_timer_map(vcpu, &map); + + ret = kvm_vgic_map_phys_irq(vcpu, + map.direct_vtimer->host_timer_irq, + map.direct_vtimer->irq.irq, + kvm_arch_timer_get_input_level); + if (ret) + return ret; + + if (map.direct_ptimer) { + ret = kvm_vgic_map_phys_irq(vcpu, + map.direct_ptimer->host_timer_irq, + map.direct_ptimer->irq.irq, + kvm_arch_timer_get_input_level); + } + + if (ret) + return ret; + +no_vgic: + timer->enabled = 1; + return 0; +} + +/* + * On VHE system, we only need to configure the EL2 timer trap register once, + * not for every world switch. + * The host kernel runs at EL2 with HCR_EL2.TGE == 1, + * and this makes those bits have no effect for the host kernel execution. + */ +void kvm_timer_init_vhe(void) +{ + /* When HCR_EL2.E2H ==1, EL1PCEN and EL1PCTEN are shifted by 10 */ + u32 cnthctl_shift = 10; + u64 val; + + /* + * VHE systems allow the guest direct access to the EL1 physical + * timer/counter. 
+ */ + val = read_sysreg(cnthctl_el2); + val |= (CNTHCTL_EL1PCEN << cnthctl_shift); + val |= (CNTHCTL_EL1PCTEN << cnthctl_shift); + write_sysreg(val, cnthctl_el2); +} + +static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq) +{ + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu_vtimer(vcpu)->irq.irq = vtimer_irq; + vcpu_ptimer(vcpu)->irq.irq = ptimer_irq; + } +} + +int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + int __user *uaddr = (int __user *)(long)attr->addr; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + int irq; + + if (!irqchip_in_kernel(vcpu->kvm)) + return -EINVAL; + + if (get_user(irq, uaddr)) + return -EFAULT; + + if (!(irq_is_ppi(irq))) + return -EINVAL; + + if (vcpu->arch.timer_cpu.enabled) + return -EBUSY; + + switch (attr->attr) { + case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: + set_timer_irqs(vcpu->kvm, irq, ptimer->irq.irq); + break; + case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: + set_timer_irqs(vcpu->kvm, vtimer->irq.irq, irq); + break; + default: + return -ENXIO; + } + + return 0; +} + +int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + int __user *uaddr = (int __user *)(long)attr->addr; + struct arch_timer_context *timer; + int irq; + + switch (attr->attr) { + case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: + timer = vcpu_vtimer(vcpu); + break; + case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: + timer = vcpu_ptimer(vcpu); + break; + default: + return -ENXIO; + } + + irq = timer->irq.irq; + return put_user(irq, uaddr); +} + +int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: + case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: + return 0; + } + + return -ENXIO; +} diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c new file mode 100644 index 000000000000..c958bb37b769 --- /dev/null +++ b/arch/arm64/kvm/arm.c @@ -0,0 
+1,1681 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include "trace_arm.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef REQUIRES_VIRT +__asm__(".arch_extension virt"); +#endif + +DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data); +static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); + +/* The VMID used in the VTTBR */ +static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); +static u32 kvm_next_vmid; +static DEFINE_SPINLOCK(kvm_vmid_lock); + +static bool vgic_present; + +static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); +DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); + +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; +} + +int kvm_arch_hardware_setup(void *opaque) +{ + return 0; +} + +int kvm_arch_check_processor_compat(void *opaque) +{ + return 0; +} + +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + int r; + + if (cap->flags) + return -EINVAL; + + switch (cap->cap) { + case KVM_CAP_ARM_NISV_TO_USER: + r = 0; + kvm->arch.return_nisv_io_abort_to_user = true; + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +/** + * kvm_arch_init_vm - initializes a VM data structure + * @kvm: pointer to the KVM struct + */ +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) +{ + int ret, cpu; + + ret = kvm_arm_setup_stage2(kvm, type); + if (ret) + return ret; + + kvm->arch.last_vcpu_ran = alloc_percpu(typeof(*kvm->arch.last_vcpu_ran)); + if (!kvm->arch.last_vcpu_ran) + return -ENOMEM; + + for_each_possible_cpu(cpu) + 
*per_cpu_ptr(kvm->arch.last_vcpu_ran, cpu) = -1; + + ret = kvm_alloc_stage2_pgd(kvm); + if (ret) + goto out_fail_alloc; + + ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP); + if (ret) + goto out_free_stage2_pgd; + + kvm_vgic_early_init(kvm); + + /* Mark the initial VMID generation invalid */ + kvm->arch.vmid.vmid_gen = 0; + + /* The maximum number of VCPUs is limited by the host's GIC model */ + kvm->arch.max_vcpus = vgic_present ? + kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS; + + return ret; +out_free_stage2_pgd: + kvm_free_stage2_pgd(kvm); +out_fail_alloc: + free_percpu(kvm->arch.last_vcpu_ran); + kvm->arch.last_vcpu_ran = NULL; + return ret; +} + +int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +{ + return 0; +} + +vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + + +/** + * kvm_arch_destroy_vm - destroy the VM data structure + * @kvm: pointer to the KVM struct + */ +void kvm_arch_destroy_vm(struct kvm *kvm) +{ + int i; + + kvm_vgic_destroy(kvm); + + free_percpu(kvm->arch.last_vcpu_ran); + kvm->arch.last_vcpu_ran = NULL; + + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + if (kvm->vcpus[i]) { + kvm_vcpu_destroy(kvm->vcpus[i]); + kvm->vcpus[i] = NULL; + } + } + atomic_set(&kvm->online_vcpus, 0); +} + +int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) +{ + int r; + switch (ext) { + case KVM_CAP_IRQCHIP: + r = vgic_present; + break; + case KVM_CAP_IOEVENTFD: + case KVM_CAP_DEVICE_CTRL: + case KVM_CAP_USER_MEMORY: + case KVM_CAP_SYNC_MMU: + case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: + case KVM_CAP_ONE_REG: + case KVM_CAP_ARM_PSCI: + case KVM_CAP_ARM_PSCI_0_2: + case KVM_CAP_READONLY_MEM: + case KVM_CAP_MP_STATE: + case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_VCPU_EVENTS: + case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2: + case KVM_CAP_ARM_NISV_TO_USER: + case KVM_CAP_ARM_INJECT_EXT_DABT: + r = 1; + break; + case KVM_CAP_ARM_SET_DEVICE_ADDR: + r = 1; + break; + case KVM_CAP_NR_VCPUS: + r = num_online_cpus(); + 
break; + case KVM_CAP_MAX_VCPUS: + r = KVM_MAX_VCPUS; + break; + case KVM_CAP_MAX_VCPU_ID: + r = KVM_MAX_VCPU_ID; + break; + case KVM_CAP_MSI_DEVID: + if (!kvm) + r = -EINVAL; + else + r = kvm->arch.vgic.msis_require_devid; + break; + case KVM_CAP_ARM_USER_IRQ: + /* + * 1: EL1_VTIMER, EL1_PTIMER, and PMU. + * (bump this number if adding more devices) + */ + r = 1; + break; + default: + r = kvm_arch_vm_ioctl_check_extension(kvm, ext); + break; + } + return r; +} + +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return -EINVAL; +} + +struct kvm *kvm_arch_alloc_vm(void) +{ + if (!has_vhe()) + return kzalloc(sizeof(struct kvm), GFP_KERNEL); + + return vzalloc(sizeof(struct kvm)); +} + +void kvm_arch_free_vm(struct kvm *kvm) +{ + if (!has_vhe()) + kfree(kvm); + else + vfree(kvm); +} + +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) + return -EBUSY; + + if (id >= kvm->arch.max_vcpus) + return -EINVAL; + + return 0; +} + +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) +{ + int err; + + /* Force users to call KVM_ARM_VCPU_INIT */ + vcpu->arch.target = -1; + bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); + + /* Set up the timer */ + kvm_timer_vcpu_init(vcpu); + + kvm_pmu_vcpu_init(vcpu); + + kvm_arm_reset_debug_ptr(vcpu); + + kvm_arm_pvtime_vcpu_init(&vcpu->arch); + + err = kvm_vgic_vcpu_init(vcpu); + if (err) + return err; + + return create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP); +} + +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ +} + +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm))) + static_branch_dec(&userspace_irqchip_in_use); + + kvm_mmu_free_memory_caches(vcpu); + kvm_timer_vcpu_terminate(vcpu); + kvm_pmu_vcpu_destroy(vcpu); + + kvm_arm_vcpu_destroy(vcpu); +} + +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) +{ + return kvm_timer_is_pending(vcpu); +} 
+ +void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) +{ + /* + * If we're about to block (most likely because we've just hit a + * WFI), we need to sync back the state of the GIC CPU interface + * so that we have the latest PMR and group enables. This ensures + * that kvm_arch_vcpu_runnable has up-to-date data to decide + * whether we have pending interrupts. + * + * For the same reason, we want to tell GICv4 that we need + * doorbells to be signalled, should an interrupt become pending. + */ + preempt_disable(); + kvm_vgic_vmcr_sync(vcpu); + vgic_v4_put(vcpu, true); + preempt_enable(); +} + +void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + vgic_v4_load(vcpu); + preempt_enable(); +} + +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + int *last_ran; + kvm_host_data_t *cpu_data; + + last_ran = this_cpu_ptr(vcpu->kvm->arch.last_vcpu_ran); + cpu_data = this_cpu_ptr(&kvm_host_data); + + /* + * We might get preempted before the vCPU actually runs, but + * over-invalidation doesn't affect correctness. 
+ */ + if (*last_ran != vcpu->vcpu_id) { + kvm_call_hyp(__kvm_tlb_flush_local_vmid, vcpu); + *last_ran = vcpu->vcpu_id; + } + + vcpu->cpu = cpu; + vcpu->arch.host_cpu_context = &cpu_data->host_ctxt; + + kvm_vgic_load(vcpu); + kvm_timer_vcpu_load(vcpu); + kvm_vcpu_load_sysregs(vcpu); + kvm_arch_vcpu_load_fp(vcpu); + kvm_vcpu_pmu_restore_guest(vcpu); + if (kvm_arm_is_pvtime_enabled(&vcpu->arch)) + kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu); + + if (single_task_running()) + vcpu_clear_wfx_traps(vcpu); + else + vcpu_set_wfx_traps(vcpu); + + vcpu_ptrauth_setup_lazy(vcpu); +} + +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) +{ + kvm_arch_vcpu_put_fp(vcpu); + kvm_vcpu_put_sysregs(vcpu); + kvm_timer_vcpu_put(vcpu); + kvm_vgic_put(vcpu); + kvm_vcpu_pmu_restore_host(vcpu); + + vcpu->cpu = -1; +} + +static void vcpu_power_off(struct kvm_vcpu *vcpu) +{ + vcpu->arch.power_off = true; + kvm_make_request(KVM_REQ_SLEEP, vcpu); + kvm_vcpu_kick(vcpu); +} + +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + if (vcpu->arch.power_off) + mp_state->mp_state = KVM_MP_STATE_STOPPED; + else + mp_state->mp_state = KVM_MP_STATE_RUNNABLE; + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + int ret = 0; + + switch (mp_state->mp_state) { + case KVM_MP_STATE_RUNNABLE: + vcpu->arch.power_off = false; + break; + case KVM_MP_STATE_STOPPED: + vcpu_power_off(vcpu); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +/** + * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled + * @v: The VCPU pointer + * + * If the guest CPU is not waiting for interrupts or an interrupt line is + * asserted, the CPU is by definition runnable. 
+ */ +int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) +{ + bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF); + return ((irq_lines || kvm_vgic_vcpu_pending_irq(v)) + && !v->arch.power_off && !v->arch.pause); +} + +/* Report whether the vcpu is in a privileged mode, per vcpu_mode_priv() */ +bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) +{ + return vcpu_mode_priv(vcpu); +} + +/* Just ensure a guest exit from a particular CPU */ +static void exit_vm_noop(void *info) +{ +} + +/* + * Run exit_vm_noop() on the CPUs in @mask via smp_call_function_many(), + * with preemption disabled around the call. + */ +void force_vm_exit(const cpumask_t *mask) +{ + preempt_disable(); + smp_call_function_many(mask, exit_vm_noop, NULL, true); + preempt_enable(); +} + +/** + * need_new_vmid_gen - check that the VMID is still valid + * @vmid: The VMID to check + * + * return true if there is a new generation of VMIDs being used + * + * The hardware supports a limited set of values with the value zero reserved + * for the host, so we check if an assigned value belongs to a previous + * generation, which requires us to assign a new value. If we're the + * first to use a VMID for the new generation, we must flush necessary caches + * and TLBs on all CPUs. + */ +static bool need_new_vmid_gen(struct kvm_vmid *vmid) +{ + u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen); + smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */ + return unlikely(READ_ONCE(vmid->vmid_gen) != current_vmid_gen); +} + +/** + * update_vmid - Update the vmid with a valid VMID for the current generation + * @vmid: The stage-2 VMID information struct + */ +static void update_vmid(struct kvm_vmid *vmid) +{ + if (!need_new_vmid_gen(vmid)) + return; + + spin_lock(&kvm_vmid_lock); + + /* + * We need to re-check the vmid_gen here to ensure that if another vcpu + * already allocated a valid vmid for this vm, then this vcpu should + * use the same vmid. + */ + if (!need_new_vmid_gen(vmid)) { + spin_unlock(&kvm_vmid_lock); + return; + } + + /* First user of a new VMID generation? 
*/ + if (unlikely(kvm_next_vmid == 0)) { + atomic64_inc(&kvm_vmid_gen); + kvm_next_vmid = 1; + + /* + * On SMP we know no other CPUs can use this CPU's or each + * other's VMID after force_vm_exit returns since the + * kvm_vmid_lock blocks them from reentry to the guest. + */ + force_vm_exit(cpu_all_mask); + /* + * Now broadcast TLB + ICACHE invalidation over the inner + * shareable domain to make sure all data structures are + * clean. + */ + kvm_call_hyp(__kvm_flush_vm_context); + } + + vmid->vmid = kvm_next_vmid; + kvm_next_vmid++; + kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1; + + smp_wmb(); + WRITE_ONCE(vmid->vmid_gen, atomic64_read(&kvm_vmid_gen)); + + spin_unlock(&kvm_vmid_lock); +} + +static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + int ret = 0; + + if (likely(vcpu->arch.has_run_once)) + return 0; + + if (!kvm_arm_vcpu_is_finalized(vcpu)) + return -EPERM; + + vcpu->arch.has_run_once = true; + + if (likely(irqchip_in_kernel(kvm))) { + /* + * Map the VGIC hardware resources before running a vcpu the + * first time on this VM. + */ + if (unlikely(!vgic_ready(kvm))) { + ret = kvm_vgic_map_resources(kvm); + if (ret) + return ret; + } + } else { + /* + * Tell the rest of the code that there are userspace irqchip + * VMs in the wild. 
+ */ + static_branch_inc(&userspace_irqchip_in_use); + } + + ret = kvm_timer_enable(vcpu); + if (ret) + return ret; + + ret = kvm_arm_pmu_v3_enable(vcpu); + + return ret; +} + +bool kvm_arch_intc_initialized(struct kvm *kvm) +{ + return vgic_initialized(kvm); +} + +void kvm_arm_halt_guest(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) + vcpu->arch.pause = true; + kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP); +} + +void kvm_arm_resume_guest(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu->arch.pause = false; + swake_up_one(kvm_arch_vcpu_wq(vcpu)); + } +} + +static void vcpu_req_sleep(struct kvm_vcpu *vcpu) +{ + struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); + + swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) && + (!vcpu->arch.pause))); + + if (vcpu->arch.power_off || vcpu->arch.pause) { + /* Awaken to handle a signal, request we sleep again later. */ + kvm_make_request(KVM_REQ_SLEEP, vcpu); + } + + /* + * Make sure we will observe a potential reset request if we've + * observed a change to the power state. Pairs with the smp_wmb() in + * kvm_psci_vcpu_on(). + */ + smp_rmb(); +} + +static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.target >= 0; +} + +static void check_vcpu_requests(struct kvm_vcpu *vcpu) +{ + if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) + vcpu_req_sleep(vcpu); + + if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) + kvm_reset_vcpu(vcpu); + + /* + * Clear IRQ_PENDING requests that were made to guarantee + * that a VCPU sees new virtual interrupts. 
+ */ + kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu); + + if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu)) + kvm_update_stolen_time(vcpu); + + if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) { + /* The distributor enable bits were changed */ + preempt_disable(); + vgic_v4_put(vcpu, false); + vgic_v4_load(vcpu); + preempt_enable(); + } + } +} + +/** + * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code + * @vcpu: The VCPU pointer + * @run: The kvm_run structure pointer used for userspace state exchange + * + * This function is called through the VCPU_RUN ioctl called from user space. It + * will execute VM code in a loop until the time slice for the process is used + * or some emulation is needed from user space in which case the function will + * return with return value 0 and with the kvm_run structure filled in with the + * required data for the requested emulation. + */ +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + int ret; + + if (unlikely(!kvm_vcpu_initialized(vcpu))) + return -ENOEXEC; + + ret = kvm_vcpu_first_run_init(vcpu); + if (ret) + return ret; + + if (run->exit_reason == KVM_EXIT_MMIO) { + ret = kvm_handle_mmio_return(vcpu, vcpu->run); + if (ret) + return ret; + } + + if (run->immediate_exit) + return -EINTR; + + vcpu_load(vcpu); + + kvm_sigset_activate(vcpu); + + ret = 1; + run->exit_reason = KVM_EXIT_UNKNOWN; + while (ret > 0) { + /* + * Check conditions before entering the guest + */ + cond_resched(); + + update_vmid(&vcpu->kvm->arch.vmid); + + check_vcpu_requests(vcpu); + + /* + * Preparing the interrupts to be injected also + * involves poking the GIC, which must be done in a + * non-preemptible context. + */ + preempt_disable(); + + kvm_pmu_flush_hwstate(vcpu); + + local_irq_disable(); + + kvm_vgic_flush_hwstate(vcpu); + + /* + * Exit if we have a signal pending so that we can deliver the + * signal to user space. 
+ */ + if (signal_pending(current)) { + ret = -EINTR; + run->exit_reason = KVM_EXIT_INTR; + } + + /* + * If we're using a userspace irqchip, then check if we need + * to tell a userspace irqchip about timer or PMU level + * changes and if so, exit to userspace (the actual level + * state gets updated in kvm_timer_update_run and + * kvm_pmu_update_run below). + */ + if (static_branch_unlikely(&userspace_irqchip_in_use)) { + if (kvm_timer_should_notify_user(vcpu) || + kvm_pmu_should_notify_user(vcpu)) { + ret = -EINTR; + run->exit_reason = KVM_EXIT_INTR; + } + } + + /* + * Ensure we set mode to IN_GUEST_MODE after we disable + * interrupts and before the final VCPU requests check. + * See the comment in kvm_vcpu_exiting_guest_mode() and + * Documentation/virt/kvm/vcpu-requests.rst + */ + smp_store_mb(vcpu->mode, IN_GUEST_MODE); + + if (ret <= 0 || need_new_vmid_gen(&vcpu->kvm->arch.vmid) || + kvm_request_pending(vcpu)) { + vcpu->mode = OUTSIDE_GUEST_MODE; + isb(); /* Ensure work in x_flush_hwstate is committed */ + kvm_pmu_sync_hwstate(vcpu); + if (static_branch_unlikely(&userspace_irqchip_in_use)) + kvm_timer_sync_hwstate(vcpu); + kvm_vgic_sync_hwstate(vcpu); + local_irq_enable(); + preempt_enable(); + continue; + } + + kvm_arm_setup_debug(vcpu); + + /************************************************************** + * Enter the guest + */ + trace_kvm_entry(*vcpu_pc(vcpu)); + guest_enter_irqoff(); + + if (has_vhe()) { + ret = kvm_vcpu_run_vhe(vcpu); + } else { + ret = kvm_call_hyp_ret(__kvm_vcpu_run_nvhe, vcpu); + } + + vcpu->mode = OUTSIDE_GUEST_MODE; + vcpu->stat.exits++; + /* + * Back from guest + *************************************************************/ + + kvm_arm_clear_debug(vcpu); + + /* + * We must sync the PMU state before the vgic state so + * that the vgic can properly sample the updated state of the + * interrupt line. 
+ */ + kvm_pmu_sync_hwstate(vcpu); + + /* + * Sync the vgic state before syncing the timer state because + * the timer code needs to know if the virtual timer + * interrupts are active. + */ + kvm_vgic_sync_hwstate(vcpu); + + /* + * Sync the timer hardware state before enabling interrupts as + * we don't want vtimer interrupts to race with syncing the + * timer virtual interrupt state. + */ + if (static_branch_unlikely(&userspace_irqchip_in_use)) + kvm_timer_sync_hwstate(vcpu); + + kvm_arch_vcpu_ctxsync_fp(vcpu); + + /* + * We may have taken a host interrupt in HYP mode (ie + * while executing the guest). This interrupt is still + * pending, as we haven't serviced it yet! + * + * We're now back in SVC mode, with interrupts + * disabled. Enabling the interrupts now will have + * the effect of taking the interrupt again, in SVC + * mode this time. + */ + local_irq_enable(); + + /* + * We do local_irq_enable() before calling guest_exit() so + * that if a timer interrupt hits while running the guest we + * account that tick as being spent in the guest. We enable + * preemption after calling guest_exit() so that if we get + * preempted we make sure ticks after that is not counted as + * guest time. 
+ */ + guest_exit(); + trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); + + /* Exit types that need handling before we can be preempted */ + handle_exit_early(vcpu, run, ret); + + preempt_enable(); + + ret = handle_exit(vcpu, run, ret); + } + + /* Tell userspace about in-kernel device output levels */ + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { + kvm_timer_update_run(vcpu); + kvm_pmu_update_run(vcpu); + } + + kvm_sigset_deactivate(vcpu); + + vcpu_put(vcpu); + return ret; +} + +static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level) +{ + int bit_index; + bool set; + unsigned long *hcr; + + if (number == KVM_ARM_IRQ_CPU_IRQ) + bit_index = __ffs(HCR_VI); + else /* KVM_ARM_IRQ_CPU_FIQ */ + bit_index = __ffs(HCR_VF); + + hcr = vcpu_hcr(vcpu); + if (level) + set = test_and_set_bit(bit_index, hcr); + else + set = test_and_clear_bit(bit_index, hcr); + + /* + * If we didn't change anything, no need to wake up or kick other CPUs + */ + if (set == level) + return 0; + + /* + * The vcpu irq_lines field was updated, wake up sleeping VCPUs and + * trigger a world-switch round on the running physical CPU to set the + * virtual IRQ/FIQ fields in the HCR appropriately. 
+ */ + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + + return 0; +} + +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, + bool line_status) +{ + u32 irq = irq_level->irq; + unsigned int irq_type, vcpu_idx, irq_num; + int nrcpus = atomic_read(&kvm->online_vcpus); + struct kvm_vcpu *vcpu = NULL; + bool level = irq_level->level; + + irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK; + vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK; + vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1); + irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK; + + trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level); + + switch (irq_type) { + case KVM_ARM_IRQ_TYPE_CPU: + if (irqchip_in_kernel(kvm)) + return -ENXIO; + + if (vcpu_idx >= nrcpus) + return -EINVAL; + + vcpu = kvm_get_vcpu(kvm, vcpu_idx); + if (!vcpu) + return -EINVAL; + + if (irq_num > KVM_ARM_IRQ_CPU_FIQ) + return -EINVAL; + + return vcpu_interrupt_line(vcpu, irq_num, level); + case KVM_ARM_IRQ_TYPE_PPI: + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + if (vcpu_idx >= nrcpus) + return -EINVAL; + + vcpu = kvm_get_vcpu(kvm, vcpu_idx); + if (!vcpu) + return -EINVAL; + + if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS) + return -EINVAL; + + return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL); + case KVM_ARM_IRQ_TYPE_SPI: + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + if (irq_num < VGIC_NR_PRIVATE_IRQS) + return -EINVAL; + + return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL); + } + + return -EINVAL; +} + +static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, + const struct kvm_vcpu_init *init) +{ + unsigned int i, ret; + int phys_target = kvm_target_cpu(); + + if (init->target != phys_target) + return -EINVAL; + + /* + * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must + * use the same target. 
+ */ + if (vcpu->arch.target != -1 && vcpu->arch.target != init->target) + return -EINVAL; + + /* -ENOENT for unknown features, -EINVAL for invalid combinations. */ + for (i = 0; i < sizeof(init->features) * 8; i++) { + bool set = (init->features[i / 32] & (1 << (i % 32))); + + if (set && i >= KVM_VCPU_MAX_FEATURES) + return -ENOENT; + + /* + * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must + * use the same feature set. + */ + if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES && + test_bit(i, vcpu->arch.features) != set) + return -EINVAL; + + if (set) + set_bit(i, vcpu->arch.features); + } + + vcpu->arch.target = phys_target; + + /* Now we know what it is, we can reset it. */ + ret = kvm_reset_vcpu(vcpu); + if (ret) { + vcpu->arch.target = -1; + bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); + } + + return ret; +} + +static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, + struct kvm_vcpu_init *init) +{ + int ret; + + ret = kvm_vcpu_set_target(vcpu, init); + if (ret) + return ret; + + /* + * Ensure a rebooted VM will fault in RAM pages and detect if the + * guest MMU is turned off and flush the caches as needed. + */ + if (vcpu->arch.has_run_once) + stage2_unmap_vm(vcpu->kvm); + + vcpu_reset_hcr(vcpu); + + /* + * Handle the "start in power-off" case. 
+ */ + if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) + vcpu_power_off(vcpu); + else + vcpu->arch.power_off = false; + + return 0; +} + +static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret = -ENXIO; + + switch (attr->group) { + default: + ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr); + break; + } + + return ret; +} + +static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret = -ENXIO; + + switch (attr->group) { + default: + ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr); + break; + } + + return ret; +} + +static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret = -ENXIO; + + switch (attr->group) { + default: + ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr); + break; + } + + return ret; +} + +static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + memset(events, 0, sizeof(*events)); + + return __kvm_arm_vcpu_get_events(vcpu, events); +} + +static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + int i; + + /* check whether the reserved field is zero */ + for (i = 0; i < ARRAY_SIZE(events->reserved); i++) + if (events->reserved[i]) + return -EINVAL; + + /* check whether the pad field is zero */ + for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++) + if (events->exception.pad[i]) + return -EINVAL; + + return __kvm_arm_vcpu_set_events(vcpu, events); +} + +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + struct kvm_device_attr attr; + long r; + + switch (ioctl) { + case KVM_ARM_VCPU_INIT: { + struct kvm_vcpu_init init; + + r = -EFAULT; + if (copy_from_user(&init, argp, sizeof(init))) + break; + + r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init); + break; + } + case KVM_SET_ONE_REG: + case KVM_GET_ONE_REG: { + struct 
kvm_one_reg reg; + + r = -ENOEXEC; + if (unlikely(!kvm_vcpu_initialized(vcpu))) + break; + + r = -EFAULT; + if (copy_from_user(&reg, argp, sizeof(reg))) + break; + + if (ioctl == KVM_SET_ONE_REG) + r = kvm_arm_set_reg(vcpu, &reg); + else + r = kvm_arm_get_reg(vcpu, &reg); + break; + } + case KVM_GET_REG_LIST: { + struct kvm_reg_list __user *user_list = argp; + struct kvm_reg_list reg_list; + unsigned n; + + r = -ENOEXEC; + if (unlikely(!kvm_vcpu_initialized(vcpu))) + break; + + r = -EPERM; + if (!kvm_arm_vcpu_is_finalized(vcpu)) + break; + + r = -EFAULT; + if (copy_from_user(&reg_list, user_list, sizeof(reg_list))) + break; + n = reg_list.n; + reg_list.n = kvm_arm_num_regs(vcpu); + if (copy_to_user(user_list, &reg_list, sizeof(reg_list))) + break; + r = -E2BIG; + if (n < reg_list.n) + break; + r = kvm_arm_copy_reg_indices(vcpu, user_list->reg); + break; + } + case KVM_SET_DEVICE_ATTR: { + r = -EFAULT; + if (copy_from_user(&attr, argp, sizeof(attr))) + break; + r = kvm_arm_vcpu_set_attr(vcpu, &attr); + break; + } + case KVM_GET_DEVICE_ATTR: { + r = -EFAULT; + if (copy_from_user(&attr, argp, sizeof(attr))) + break; + r = kvm_arm_vcpu_get_attr(vcpu, &attr); + break; + } + case KVM_HAS_DEVICE_ATTR: { + r = -EFAULT; + if (copy_from_user(&attr, argp, sizeof(attr))) + break; + r = kvm_arm_vcpu_has_attr(vcpu, &attr); + break; + } + case KVM_GET_VCPU_EVENTS: { + struct kvm_vcpu_events events; + + if (kvm_arm_vcpu_get_events(vcpu, &events)) + return -EINVAL; + + if (copy_to_user(argp, &events, sizeof(events))) + return -EFAULT; + + return 0; + } + case KVM_SET_VCPU_EVENTS: { + struct kvm_vcpu_events events; + + if (copy_from_user(&events, argp, sizeof(events))) + return -EFAULT; + + return kvm_arm_vcpu_set_events(vcpu, &events); + } + case KVM_ARM_VCPU_FINALIZE: { + int what; + + if (!kvm_vcpu_initialized(vcpu)) + return -ENOEXEC; + + if (get_user(what, (const int __user *)argp)) + return -EFAULT; + + return kvm_arm_vcpu_finalize(vcpu, what); + } + default: + r = -EINVAL; + } + + return r;
+} + +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ + +} + +void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + kvm_flush_remote_tlbs(kvm); +} + +static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, + struct kvm_arm_device_addr *dev_addr) +{ + unsigned long dev_id, type; + + dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >> + KVM_ARM_DEVICE_ID_SHIFT; + type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >> + KVM_ARM_DEVICE_TYPE_SHIFT; + + switch (dev_id) { + case KVM_ARM_DEVICE_VGIC_V2: + if (!vgic_present) + return -ENXIO; + return kvm_vgic_addr(kvm, type, &dev_addr->addr, true); + default: + return -ENODEV; + } +} + +long kvm_arch_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + void __user *argp = (void __user *)arg; + + switch (ioctl) { + case KVM_CREATE_IRQCHIP: { + int ret; + if (!vgic_present) + return -ENXIO; + mutex_lock(&kvm->lock); + ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + mutex_unlock(&kvm->lock); + return ret; + } + case KVM_ARM_SET_DEVICE_ADDR: { + struct kvm_arm_device_addr dev_addr; + + if (copy_from_user(&dev_addr, argp, sizeof(dev_addr))) + return -EFAULT; + return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr); + } + case KVM_ARM_PREFERRED_TARGET: { + int err; + struct kvm_vcpu_init init; + + err = kvm_vcpu_preferred_target(&init); + if (err) + return err; + + if (copy_to_user(argp, &init, sizeof(init))) + return -EFAULT; + + return 0; + } + default: + return -EINVAL; + } +} + +static void cpu_init_hyp_mode(void) +{ + phys_addr_t pgd_ptr; + unsigned long hyp_stack_ptr; + unsigned long stack_page; + unsigned long vector_ptr; + + /* Switch from the HYP stub to our own HYP init vector */ + __hyp_set_vectors(kvm_get_idmap_vector()); + + pgd_ptr = kvm_mmu_get_httbr(); + stack_page = __this_cpu_read(kvm_arm_hyp_stack_page); + hyp_stack_ptr = stack_page + PAGE_SIZE; + vector_ptr = (unsigned 
long)kvm_get_hyp_vector(); + + __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr); + __cpu_init_stage2(); +} + +static void cpu_hyp_reset(void) +{ + if (!is_kernel_in_hyp_mode()) + __hyp_reset_vectors(); +} + +static void cpu_hyp_reinit(void) +{ + kvm_init_host_cpu_context(&this_cpu_ptr(&kvm_host_data)->host_ctxt); + + cpu_hyp_reset(); + + if (is_kernel_in_hyp_mode()) + kvm_timer_init_vhe(); + else + cpu_init_hyp_mode(); + + kvm_arm_init_debug(); + + if (vgic_present) + kvm_vgic_init_cpu_hardware(); +} + +static void _kvm_arch_hardware_enable(void *discard) +{ + if (!__this_cpu_read(kvm_arm_hardware_enabled)) { + cpu_hyp_reinit(); + __this_cpu_write(kvm_arm_hardware_enabled, 1); + } +} + +int kvm_arch_hardware_enable(void) +{ + _kvm_arch_hardware_enable(NULL); + return 0; +} + +static void _kvm_arch_hardware_disable(void *discard) +{ + if (__this_cpu_read(kvm_arm_hardware_enabled)) { + cpu_hyp_reset(); + __this_cpu_write(kvm_arm_hardware_enabled, 0); + } +} + +void kvm_arch_hardware_disable(void) +{ + _kvm_arch_hardware_disable(NULL); +} + +#ifdef CONFIG_CPU_PM +static int hyp_init_cpu_pm_notifier(struct notifier_block *self, + unsigned long cmd, + void *v) +{ + /* + * kvm_arm_hardware_enabled is left with its old value over + * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should + * re-enable hyp. + */ + switch (cmd) { + case CPU_PM_ENTER: + if (__this_cpu_read(kvm_arm_hardware_enabled)) + /* + * don't update kvm_arm_hardware_enabled here + * so that the hardware will be re-enabled + * when we resume. See below. + */ + cpu_hyp_reset(); + + return NOTIFY_OK; + case CPU_PM_ENTER_FAILED: + case CPU_PM_EXIT: + if (__this_cpu_read(kvm_arm_hardware_enabled)) + /* The hardware was enabled before suspend. 
*/ + cpu_hyp_reinit(); + + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block hyp_init_cpu_pm_nb = { + .notifier_call = hyp_init_cpu_pm_notifier, +}; + +static void __init hyp_cpu_pm_init(void) +{ + cpu_pm_register_notifier(&hyp_init_cpu_pm_nb); +} +static void __init hyp_cpu_pm_exit(void) +{ + cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb); +} +#else +static inline void hyp_cpu_pm_init(void) +{ +} +static inline void hyp_cpu_pm_exit(void) +{ +} +#endif + +static int init_common_resources(void) +{ + kvm_set_ipa_limit(); + + return 0; +} + +static int init_subsystems(void) +{ + int err = 0; + + /* + * Enable hardware so that subsystem initialisation can access EL2. + */ + on_each_cpu(_kvm_arch_hardware_enable, NULL, 1); + + /* + * Register CPU lower-power notifier + */ + hyp_cpu_pm_init(); + + /* + * Init HYP view of VGIC + */ + err = kvm_vgic_hyp_init(); + switch (err) { + case 0: + vgic_present = true; + break; + case -ENODEV: + case -ENXIO: + vgic_present = false; + err = 0; + break; + default: + goto out; + } + + /* + * Init HYP architected timer support + */ + err = kvm_timer_hyp_init(vgic_present); + if (err) + goto out; + + kvm_perf_init(); + kvm_coproc_table_init(); + +out: + on_each_cpu(_kvm_arch_hardware_disable, NULL, 1); + + return err; +} + +static void teardown_hyp_mode(void) +{ + int cpu; + + free_hyp_pgds(); + for_each_possible_cpu(cpu) + free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); +} + +/** + * Inits Hyp-mode on all online CPUs + */ +static int init_hyp_mode(void) +{ + int cpu; + int err = 0; + + /* + * Allocate Hyp PGD and setup Hyp identity mapping + */ + err = kvm_mmu_init(); + if (err) + goto out_err; + + /* + * Allocate stack pages for Hypervisor-mode + */ + for_each_possible_cpu(cpu) { + unsigned long stack_page; + + stack_page = __get_free_page(GFP_KERNEL); + if (!stack_page) { + err = -ENOMEM; + goto out_err; + } + + per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page; + } + + /* + * Map the 
Hyp-code called directly from the host + */ + err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start), + kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC); + if (err) { + kvm_err("Cannot map world-switch code\n"); + goto out_err; + } + + err = create_hyp_mappings(kvm_ksym_ref(__start_rodata), + kvm_ksym_ref(__end_rodata), PAGE_HYP_RO); + if (err) { + kvm_err("Cannot map rodata section\n"); + goto out_err; + } + + err = create_hyp_mappings(kvm_ksym_ref(__bss_start), + kvm_ksym_ref(__bss_stop), PAGE_HYP_RO); + if (err) { + kvm_err("Cannot map bss section\n"); + goto out_err; + } + + err = kvm_map_vectors(); + if (err) { + kvm_err("Cannot map vectors\n"); + goto out_err; + } + + /* + * Map the Hyp stack pages + */ + for_each_possible_cpu(cpu) { + char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu); + err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE, + PAGE_HYP); + + if (err) { + kvm_err("Cannot map hyp stack\n"); + goto out_err; + } + } + + for_each_possible_cpu(cpu) { + kvm_host_data_t *cpu_data; + + cpu_data = per_cpu_ptr(&kvm_host_data, cpu); + err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP); + + if (err) { + kvm_err("Cannot map host CPU state: %d\n", err); + goto out_err; + } + } + + err = hyp_map_aux_data(); + if (err) + kvm_err("Cannot map host auxiliary data: %d\n", err); + + return 0; + +out_err: + teardown_hyp_mode(); + kvm_err("error initializing Hyp mode: %d\n", err); + return err; +} + +static void check_kvm_target_cpu(void *ret) +{ + *(int *)ret = kvm_target_cpu(); +} + +struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) +{ + struct kvm_vcpu *vcpu; + int i; + + mpidr &= MPIDR_HWID_BITMASK; + kvm_for_each_vcpu(i, vcpu, kvm) { + if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu)) + return vcpu; + } + return NULL; +} + +bool kvm_arch_has_irq_bypass(void) +{ + return true; +} + +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd 
*irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq, + &irqfd->irq_entry); +} +void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq, + &irqfd->irq_entry); +} + +void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + kvm_arm_halt_guest(irqfd->kvm); +} + +void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + kvm_arm_resume_guest(irqfd->kvm); +} + +/** + * Initialize Hyp-mode and memory mappings on all CPUs. + */ +int kvm_arch_init(void *opaque) +{ + int err; + int ret, cpu; + bool in_hyp_mode; + + if (!is_hyp_mode_available()) { + kvm_info("HYP mode not available\n"); + return -ENODEV; + } + + in_hyp_mode = is_kernel_in_hyp_mode(); + + if (!in_hyp_mode && kvm_arch_requires_vhe()) { + kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n"); + return -ENODEV; + } + + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1); + if (ret < 0) { + kvm_err("Error, CPU %d not supported!\n", cpu); + return -ENODEV; + } + } + + err = init_common_resources(); + if (err) + return err; + + err = kvm_arm_init_sve(); + if (err) + return err; + + if (!in_hyp_mode) { + err = init_hyp_mode(); + if (err) + goto out_err; + } + + err = init_subsystems(); + if (err) + goto out_hyp; + + if (in_hyp_mode) + kvm_info("VHE mode initialized successfully\n"); + else + kvm_info("Hyp mode initialized successfully\n"); + + return 0; + +out_hyp: + hyp_cpu_pm_exit(); + if (!in_hyp_mode) + teardown_hyp_mode(); +out_err: + return err; +} + +/* NOP: Compiling as a module 
not supported */ +void kvm_arch_exit(void) +{ + kvm_perf_teardown(); +} + +static int arm_init(void) +{ + int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + return rc; +} + +module_init(arm_init); diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index aacfc55de44c..eb194696ef62 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -23,7 +23,7 @@ #include #define CREATE_TRACE_POINTS -#include "trace.h" +#include "trace_handle_exit.h" typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *); diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index ea710f674cb6..dc18274a6826 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -6,12 +6,9 @@ ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING \ $(DISABLE_STACKLEAK_PLUGIN) -KVM=../../../../virt/kvm - -obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o -obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o -obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/aarch32.o - +obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o +obj-$(CONFIG_KVM_ARM_HOST) += timer-sr.o +obj-$(CONFIG_KVM_ARM_HOST) += aarch32.o obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-cpuif-proxy.o obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o diff --git a/arch/arm64/kvm/hyp/aarch32.c b/arch/arm64/kvm/hyp/aarch32.c new file mode 100644 index 000000000000..25c0e47d57cb --- /dev/null +++ b/arch/arm64/kvm/hyp/aarch32.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hyp portion of the (not much of an) Emulation layer for 32bit guests. 
+ * + * Copyright (C) 2012,2013 - ARM Ltd + * Author: Marc Zyngier + * + * based on arch/arm/kvm/emulate.c + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall + */ + +#include +#include +#include + +/* + * stolen from arch/arm/kernel/opcodes.c + * + * condition code lookup table + * index into the table is test code: EQ, NE, ... LT, GT, AL, NV + * + * bit position in short is condition code: NZCV + */ +static const unsigned short cc_map[16] = { + 0xF0F0, /* EQ == Z set */ + 0x0F0F, /* NE */ + 0xCCCC, /* CS == C set */ + 0x3333, /* CC */ + 0xFF00, /* MI == N set */ + 0x00FF, /* PL */ + 0xAAAA, /* VS == V set */ + 0x5555, /* VC */ + 0x0C0C, /* HI == C set && Z clear */ + 0xF3F3, /* LS == C clear || Z set */ + 0xAA55, /* GE == (N==V) */ + 0x55AA, /* LT == (N!=V) */ + 0x0A05, /* GT == (!Z && (N==V)) */ + 0xF5FA, /* LE == (Z || (N!=V)) */ + 0xFFFF, /* AL always */ + 0 /* NV */ +}; + +/* + * Check if a trapped instruction should have been executed or not. + */ +bool __hyp_text kvm_condition_valid32(const struct kvm_vcpu *vcpu) +{ + unsigned long cpsr; + u32 cpsr_cond; + int cond; + + /* Top two bits non-zero? Unconditional. */ + if (kvm_vcpu_get_hsr(vcpu) >> 30) + return true; + + /* Is condition field valid? */ + cond = kvm_vcpu_get_condition(vcpu); + if (cond == 0xE) + return true; + + cpsr = *vcpu_cpsr(vcpu); + + if (cond < 0) { + /* This can happen in Thumb mode: examine IT state. */ + unsigned long it; + + it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3); + + /* it == 0 => unconditional. */ + if (it == 0) + return true; + + /* The cond for this insn works out as the top 4 bits. 
*/ + cond = (it >> 4); + } + + cpsr_cond = cpsr >> 28; + + if (!((cc_map[cond] >> cpsr_cond) & 1)) + return false; + + return true; +} + +/** + * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block + * @vcpu: The VCPU pointer + * + * When exceptions occur while instructions are executed in Thumb IF-THEN + * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have + * to do this little bit of work manually. The fields map like this: + * + * IT[7:0] -> CPSR[26:25],CPSR[15:10] + */ +static void __hyp_text kvm_adjust_itstate(struct kvm_vcpu *vcpu) +{ + unsigned long itbits, cond; + unsigned long cpsr = *vcpu_cpsr(vcpu); + bool is_arm = !(cpsr & PSR_AA32_T_BIT); + + if (is_arm || !(cpsr & PSR_AA32_IT_MASK)) + return; + + cond = (cpsr & 0xe000) >> 13; + itbits = (cpsr & 0x1c00) >> (10 - 2); + itbits |= (cpsr & (0x3 << 25)) >> 25; + + /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */ + if ((itbits & 0x7) == 0) + itbits = cond = 0; + else + itbits = (itbits << 1) & 0x1f; + + cpsr &= ~PSR_AA32_IT_MASK; + cpsr |= cond << 13; + cpsr |= (itbits & 0x1c) << (10 - 2); + cpsr |= (itbits & 0x3) << 25; + *vcpu_cpsr(vcpu) = cpsr; +} + +/** + * kvm_skip_instr - skip a trapped instruction and proceed to the next + * @vcpu: The vcpu pointer + */ +void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr) +{ + u32 pc = *vcpu_pc(vcpu); + bool is_thumb; + + is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT); + if (is_thumb && !is_wide_instr) + pc += 2; + else + pc += 4; + + *vcpu_pc(vcpu) = pc; + + kvm_adjust_itstate(vcpu); +} diff --git a/arch/arm64/kvm/hyp/timer-sr.c b/arch/arm64/kvm/hyp/timer-sr.c new file mode 100644 index 000000000000..ff76e6845fe4 --- /dev/null +++ b/arch/arm64/kvm/hyp/timer-sr.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012-2015 - ARM Ltd + * Author: Marc Zyngier + */ + +#include +#include +#include + +#include + +void __hyp_text __kvm_timer_set_cntvoff(u32 
cntvoff_low, u32 cntvoff_high) +{ + u64 cntvoff = (u64)cntvoff_high << 32 | cntvoff_low; + write_sysreg(cntvoff, cntvoff_el2); +} + +/* + * Should only be called on non-VHE systems. + * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). + */ +void __hyp_text __timer_disable_traps(struct kvm_vcpu *vcpu) +{ + u64 val; + + /* Allow physical timer/counter access for the host */ + val = read_sysreg(cnthctl_el2); + val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN; + write_sysreg(val, cnthctl_el2); +} + +/* + * Should only be called on non-VHE systems. + * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). + */ +void __hyp_text __timer_enable_traps(struct kvm_vcpu *vcpu) +{ + u64 val; + + /* + * Disallow physical timer access for the guest + * Physical counter access is allowed + */ + val = read_sysreg(cnthctl_el2); + val &= ~CNTHCTL_EL1PCEN; + val |= CNTHCTL_EL1PCTEN; + write_sysreg(val, cnthctl_el2); +} diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c new file mode 100644 index 000000000000..49fedf6710f9 --- /dev/null +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -0,0 +1,1126 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012-2015 - ARM Ltd + * Author: Marc Zyngier + */ + +#include +#include +#include + +#include +#include +#include + +#define vtr_to_max_lr_idx(v) ((v) & 0xf) +#define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1) +#define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5)) + +static u64 __hyp_text __gic_v3_get_lr(unsigned int lr) +{ + switch (lr & 0xf) { + case 0: + return read_gicreg(ICH_LR0_EL2); + case 1: + return read_gicreg(ICH_LR1_EL2); + case 2: + return read_gicreg(ICH_LR2_EL2); + case 3: + return read_gicreg(ICH_LR3_EL2); + case 4: + return read_gicreg(ICH_LR4_EL2); + case 5: + return read_gicreg(ICH_LR5_EL2); + case 6: + return read_gicreg(ICH_LR6_EL2); + case 7: + return read_gicreg(ICH_LR7_EL2); + case 8: + return read_gicreg(ICH_LR8_EL2); + 
case 9: + return read_gicreg(ICH_LR9_EL2); + case 10: + return read_gicreg(ICH_LR10_EL2); + case 11: + return read_gicreg(ICH_LR11_EL2); + case 12: + return read_gicreg(ICH_LR12_EL2); + case 13: + return read_gicreg(ICH_LR13_EL2); + case 14: + return read_gicreg(ICH_LR14_EL2); + case 15: + return read_gicreg(ICH_LR15_EL2); + } + + unreachable(); +} + +static void __hyp_text __gic_v3_set_lr(u64 val, int lr) +{ + switch (lr & 0xf) { + case 0: + write_gicreg(val, ICH_LR0_EL2); + break; + case 1: + write_gicreg(val, ICH_LR1_EL2); + break; + case 2: + write_gicreg(val, ICH_LR2_EL2); + break; + case 3: + write_gicreg(val, ICH_LR3_EL2); + break; + case 4: + write_gicreg(val, ICH_LR4_EL2); + break; + case 5: + write_gicreg(val, ICH_LR5_EL2); + break; + case 6: + write_gicreg(val, ICH_LR6_EL2); + break; + case 7: + write_gicreg(val, ICH_LR7_EL2); + break; + case 8: + write_gicreg(val, ICH_LR8_EL2); + break; + case 9: + write_gicreg(val, ICH_LR9_EL2); + break; + case 10: + write_gicreg(val, ICH_LR10_EL2); + break; + case 11: + write_gicreg(val, ICH_LR11_EL2); + break; + case 12: + write_gicreg(val, ICH_LR12_EL2); + break; + case 13: + write_gicreg(val, ICH_LR13_EL2); + break; + case 14: + write_gicreg(val, ICH_LR14_EL2); + break; + case 15: + write_gicreg(val, ICH_LR15_EL2); + break; + } +} + +static void __hyp_text __vgic_v3_write_ap0rn(u32 val, int n) +{ + switch (n) { + case 0: + write_gicreg(val, ICH_AP0R0_EL2); + break; + case 1: + write_gicreg(val, ICH_AP0R1_EL2); + break; + case 2: + write_gicreg(val, ICH_AP0R2_EL2); + break; + case 3: + write_gicreg(val, ICH_AP0R3_EL2); + break; + } +} + +static void __hyp_text __vgic_v3_write_ap1rn(u32 val, int n) +{ + switch (n) { + case 0: + write_gicreg(val, ICH_AP1R0_EL2); + break; + case 1: + write_gicreg(val, ICH_AP1R1_EL2); + break; + case 2: + write_gicreg(val, ICH_AP1R2_EL2); + break; + case 3: + write_gicreg(val, ICH_AP1R3_EL2); + break; + } +} + +static u32 __hyp_text __vgic_v3_read_ap0rn(int n) +{ + u32 val; + + switch 
(n) { + case 0: + val = read_gicreg(ICH_AP0R0_EL2); + break; + case 1: + val = read_gicreg(ICH_AP0R1_EL2); + break; + case 2: + val = read_gicreg(ICH_AP0R2_EL2); + break; + case 3: + val = read_gicreg(ICH_AP0R3_EL2); + break; + default: + unreachable(); + } + + return val; +} + +static u32 __hyp_text __vgic_v3_read_ap1rn(int n) +{ + u32 val; + + switch (n) { + case 0: + val = read_gicreg(ICH_AP1R0_EL2); + break; + case 1: + val = read_gicreg(ICH_AP1R1_EL2); + break; + case 2: + val = read_gicreg(ICH_AP1R2_EL2); + break; + case 3: + val = read_gicreg(ICH_AP1R3_EL2); + break; + default: + unreachable(); + } + + return val; +} + +void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; + + /* + * Make sure stores to the GIC via the memory mapped interface + * are now visible to the system register interface when reading the + * LRs, and when reading back the VMCR on non-VHE systems. 
+ */ + if (used_lrs || !has_vhe()) { + if (!cpu_if->vgic_sre) { + dsb(sy); + isb(); + } + } + + if (used_lrs || cpu_if->its_vpe.its_vm) { + int i; + u32 elrsr; + + elrsr = read_gicreg(ICH_ELRSR_EL2); + + write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2); + + for (i = 0; i < used_lrs; i++) { + if (elrsr & (1 << i)) + cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; + else + cpu_if->vgic_lr[i] = __gic_v3_get_lr(i); + + __gic_v3_set_lr(0, i); + } + } +} + +void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; + int i; + + if (used_lrs || cpu_if->its_vpe.its_vm) { + write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); + + for (i = 0; i < used_lrs; i++) + __gic_v3_set_lr(cpu_if->vgic_lr[i], i); + } + + /* + * Ensure that writes to the LRs, and on non-VHE systems ensure that + * the write to the VMCR in __vgic_v3_activate_traps(), will have + * reached the (re)distributors. This ensure the guest will read the + * correct values from the memory-mapped interface. + */ + if (used_lrs || !has_vhe()) { + if (!cpu_if->vgic_sre) { + isb(); + dsb(sy); + } + } +} + +void __hyp_text __vgic_v3_activate_traps(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + + /* + * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a + * Group0 interrupt (as generated in GICv2 mode) to be + * delivered as a FIQ to the guest, with potentially fatal + * consequences. So we must make sure that ICC_SRE_EL1 has + * been actually programmed with the value we want before + * starting to mess with the rest of the GIC, and VMCR_EL2 in + * particular. This logic must be called before + * __vgic_v3_restore_state(). + */ + if (!cpu_if->vgic_sre) { + write_gicreg(0, ICC_SRE_EL1); + isb(); + write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); + + + if (has_vhe()) { + /* + * Ensure that the write to the VMCR will have reached + * the (re)distributors. 
This ensure the guest will + * read the correct values from the memory-mapped + * interface. + */ + isb(); + dsb(sy); + } + } + + /* + * Prevent the guest from touching the GIC system registers if + * SRE isn't enabled for GICv3 emulation. + */ + write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, + ICC_SRE_EL2); + + /* + * If we need to trap system registers, we must write + * ICH_HCR_EL2 anyway, even if no interrupts are being + * injected, + */ + if (static_branch_unlikely(&vgic_v3_cpuif_trap) || + cpu_if->its_vpe.its_vm) + write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); +} + +void __hyp_text __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u64 val; + + if (!cpu_if->vgic_sre) { + cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); + } + + val = read_gicreg(ICC_SRE_EL2); + write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); + + if (!cpu_if->vgic_sre) { + /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ + isb(); + write_gicreg(1, ICC_SRE_EL1); + } + + /* + * If we were trapping system registers, we enabled the VGIC even if + * no interrupts were being injected, and we disable it again here. 
+ */ + if (static_branch_unlikely(&vgic_v3_cpuif_trap) || + cpu_if->its_vpe.its_vm) + write_gicreg(0, ICH_HCR_EL2); +} + +void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if; + u64 val; + u32 nr_pre_bits; + + vcpu = kern_hyp_va(vcpu); + cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + + val = read_gicreg(ICH_VTR_EL2); + nr_pre_bits = vtr_to_nr_pre_bits(val); + + switch (nr_pre_bits) { + case 7: + cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3); + cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2); + /* Fall through */ + case 6: + cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1); + /* Fall through */ + default: + cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0); + } + + switch (nr_pre_bits) { + case 7: + cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3); + cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2); + /* Fall through */ + case 6: + cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1); + /* Fall through */ + default: + cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0); + } +} + +void __hyp_text __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if; + u64 val; + u32 nr_pre_bits; + + vcpu = kern_hyp_va(vcpu); + cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + + val = read_gicreg(ICH_VTR_EL2); + nr_pre_bits = vtr_to_nr_pre_bits(val); + + switch (nr_pre_bits) { + case 7: + __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3); + __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2); + /* Fall through */ + case 6: + __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1); + /* Fall through */ + default: + __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0); + } + + switch (nr_pre_bits) { + case 7: + __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3); + __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2); + /* Fall through */ + case 6: + __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1); + /* Fall through */ + default: + __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0); + } +} + +void __hyp_text __vgic_v3_init_lrs(void) +{ + int max_lr_idx = 
vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2)); + int i; + + for (i = 0; i <= max_lr_idx; i++) + __gic_v3_set_lr(0, i); +} + +u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void) +{ + return read_gicreg(ICH_VTR_EL2); +} + +u64 __hyp_text __vgic_v3_read_vmcr(void) +{ + return read_gicreg(ICH_VMCR_EL2); +} + +void __hyp_text __vgic_v3_write_vmcr(u32 vmcr) +{ + write_gicreg(vmcr, ICH_VMCR_EL2); +} + +static int __hyp_text __vgic_v3_bpr_min(void) +{ + /* See Pseudocode for VPriorityGroup */ + return 8 - vtr_to_nr_pre_bits(read_gicreg(ICH_VTR_EL2)); +} + +static int __hyp_text __vgic_v3_get_group(struct kvm_vcpu *vcpu) +{ + u32 esr = kvm_vcpu_get_hsr(vcpu); + u8 crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT; + + return crm != 8; +} + +#define GICv3_IDLE_PRIORITY 0xff + +static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu, + u32 vmcr, + u64 *lr_val) +{ + unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs; + u8 priority = GICv3_IDLE_PRIORITY; + int i, lr = -1; + + for (i = 0; i < used_lrs; i++) { + u64 val = __gic_v3_get_lr(i); + u8 lr_prio = (val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; + + /* Not pending in the state? */ + if ((val & ICH_LR_STATE) != ICH_LR_PENDING_BIT) + continue; + + /* Group-0 interrupt, but Group-0 disabled? */ + if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG0_MASK)) + continue; + + /* Group-1 interrupt, but Group-1 disabled? */ + if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG1_MASK)) + continue; + + /* Not the highest priority? 
*/ + if (lr_prio >= priority) + continue; + + /* This is a candidate */ + priority = lr_prio; + *lr_val = val; + lr = i; + } + + if (lr == -1) + *lr_val = ICC_IAR1_EL1_SPURIOUS; + + return lr; +} + +static int __hyp_text __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu, + int intid, u64 *lr_val) +{ + unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs; + int i; + + for (i = 0; i < used_lrs; i++) { + u64 val = __gic_v3_get_lr(i); + + if ((val & ICH_LR_VIRTUAL_ID_MASK) == intid && + (val & ICH_LR_ACTIVE_BIT)) { + *lr_val = val; + return i; + } + } + + *lr_val = ICC_IAR1_EL1_SPURIOUS; + return -1; +} + +static int __hyp_text __vgic_v3_get_highest_active_priority(void) +{ + u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2)); + u32 hap = 0; + int i; + + for (i = 0; i < nr_apr_regs; i++) { + u32 val; + + /* + * The ICH_AP0Rn_EL2 and ICH_AP1Rn_EL2 registers + * contain the active priority levels for this VCPU + * for the maximum number of supported priority + * levels, and we return the full priority level only + * if the BPR is programmed to its minimum, otherwise + * we return a combination of the priority level and + * subpriority, as determined by the setting of the + * BPR, but without the full subpriority. + */ + val = __vgic_v3_read_ap0rn(i); + val |= __vgic_v3_read_ap1rn(i); + if (!val) { + hap += 32; + continue; + } + + return (hap + __ffs(val)) << __vgic_v3_bpr_min(); + } + + return GICv3_IDLE_PRIORITY; +} + +static unsigned int __hyp_text __vgic_v3_get_bpr0(u32 vmcr) +{ + return (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; +} + +static unsigned int __hyp_text __vgic_v3_get_bpr1(u32 vmcr) +{ + unsigned int bpr; + + if (vmcr & ICH_VMCR_CBPR_MASK) { + bpr = __vgic_v3_get_bpr0(vmcr); + if (bpr < 7) + bpr++; + } else { + bpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; + } + + return bpr; +} + +/* + * Convert a priority to a preemption level, taking the relevant BPR + * into account by zeroing the sub-priority bits. 
+ */ +static u8 __hyp_text __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp) +{ + unsigned int bpr; + + if (!grp) + bpr = __vgic_v3_get_bpr0(vmcr) + 1; + else + bpr = __vgic_v3_get_bpr1(vmcr); + + return pri & (GENMASK(7, 0) << bpr); +} + +/* + * The priority value is independent of any of the BPR values, so we + * normalize it using the minumal BPR value. This guarantees that no + * matter what the guest does with its BPR, we can always set/get the + * same value of a priority. + */ +static void __hyp_text __vgic_v3_set_active_priority(u8 pri, u32 vmcr, int grp) +{ + u8 pre, ap; + u32 val; + int apr; + + pre = __vgic_v3_pri_to_pre(pri, vmcr, grp); + ap = pre >> __vgic_v3_bpr_min(); + apr = ap / 32; + + if (!grp) { + val = __vgic_v3_read_ap0rn(apr); + __vgic_v3_write_ap0rn(val | BIT(ap % 32), apr); + } else { + val = __vgic_v3_read_ap1rn(apr); + __vgic_v3_write_ap1rn(val | BIT(ap % 32), apr); + } +} + +static int __hyp_text __vgic_v3_clear_highest_active_priority(void) +{ + u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2)); + u32 hap = 0; + int i; + + for (i = 0; i < nr_apr_regs; i++) { + u32 ap0, ap1; + int c0, c1; + + ap0 = __vgic_v3_read_ap0rn(i); + ap1 = __vgic_v3_read_ap1rn(i); + if (!ap0 && !ap1) { + hap += 32; + continue; + } + + c0 = ap0 ? __ffs(ap0) : 32; + c1 = ap1 ? 
__ffs(ap1) : 32; + + /* Always clear the LSB, which is the highest priority */ + if (c0 < c1) { + ap0 &= ~BIT(c0); + __vgic_v3_write_ap0rn(ap0, i); + hap += c0; + } else { + ap1 &= ~BIT(c1); + __vgic_v3_write_ap1rn(ap1, i); + hap += c1; + } + + /* Rescale to 8 bits of priority */ + return hap << __vgic_v3_bpr_min(); + } + + return GICv3_IDLE_PRIORITY; +} + +static void __hyp_text __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u64 lr_val; + u8 lr_prio, pmr; + int lr, grp; + + grp = __vgic_v3_get_group(vcpu); + + lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val); + if (lr < 0) + goto spurious; + + if (grp != !!(lr_val & ICH_LR_GROUP)) + goto spurious; + + pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; + lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; + if (pmr <= lr_prio) + goto spurious; + + if (__vgic_v3_get_highest_active_priority() <= __vgic_v3_pri_to_pre(lr_prio, vmcr, grp)) + goto spurious; + + lr_val &= ~ICH_LR_STATE; + /* No active state for LPIs */ + if ((lr_val & ICH_LR_VIRTUAL_ID_MASK) <= VGIC_MAX_SPI) + lr_val |= ICH_LR_ACTIVE_BIT; + __gic_v3_set_lr(lr_val, lr); + __vgic_v3_set_active_priority(lr_prio, vmcr, grp); + vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK); + return; + +spurious: + vcpu_set_reg(vcpu, rt, ICC_IAR1_EL1_SPURIOUS); +} + +static void __hyp_text __vgic_v3_clear_active_lr(int lr, u64 lr_val) +{ + lr_val &= ~ICH_LR_ACTIVE_BIT; + if (lr_val & ICH_LR_HW) { + u32 pid; + + pid = (lr_val & ICH_LR_PHYS_ID_MASK) >> ICH_LR_PHYS_ID_SHIFT; + gic_write_dir(pid); + } + + __gic_v3_set_lr(lr_val, lr); +} + +static void __hyp_text __vgic_v3_bump_eoicount(void) +{ + u32 hcr; + + hcr = read_gicreg(ICH_HCR_EL2); + hcr += 1 << ICH_HCR_EOIcount_SHIFT; + write_gicreg(hcr, ICH_HCR_EL2); +} + +static void __hyp_text __vgic_v3_write_dir(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + u32 vid = vcpu_get_reg(vcpu, rt); + u64 lr_val; + int lr; + + /* EOImode == 0, nothing to be done here */ + if (!(vmcr & 
ICH_VMCR_EOIM_MASK)) + return; + + /* No deactivate to be performed on an LPI */ + if (vid >= VGIC_MIN_LPI) + return; + + lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); + if (lr == -1) { + __vgic_v3_bump_eoicount(); + return; + } + + __vgic_v3_clear_active_lr(lr, lr_val); +} + +static void __hyp_text __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u32 vid = vcpu_get_reg(vcpu, rt); + u64 lr_val; + u8 lr_prio, act_prio; + int lr, grp; + + grp = __vgic_v3_get_group(vcpu); + + /* Drop priority in any case */ + act_prio = __vgic_v3_clear_highest_active_priority(); + + /* If EOIing an LPI, no deactivate to be performed */ + if (vid >= VGIC_MIN_LPI) + return; + + /* EOImode == 1, nothing to be done here */ + if (vmcr & ICH_VMCR_EOIM_MASK) + return; + + lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); + if (lr == -1) { + __vgic_v3_bump_eoicount(); + return; + } + + lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; + + /* If priorities or group do not match, the guest has fscked-up. 
*/ + if (grp != !!(lr_val & ICH_LR_GROUP) || + __vgic_v3_pri_to_pre(lr_prio, vmcr, grp) != act_prio) + return; + + /* Let's now perform the deactivation */ + __vgic_v3_clear_active_lr(lr, lr_val); +} + +static void __hyp_text __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG0_MASK)); +} + +static void __hyp_text __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG1_MASK)); +} + +static void __hyp_text __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u64 val = vcpu_get_reg(vcpu, rt); + + if (val & 1) + vmcr |= ICH_VMCR_ENG0_MASK; + else + vmcr &= ~ICH_VMCR_ENG0_MASK; + + __vgic_v3_write_vmcr(vmcr); +} + +static void __hyp_text __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u64 val = vcpu_get_reg(vcpu, rt); + + if (val & 1) + vmcr |= ICH_VMCR_ENG1_MASK; + else + vmcr &= ~ICH_VMCR_ENG1_MASK; + + __vgic_v3_write_vmcr(vmcr); +} + +static void __hyp_text __vgic_v3_read_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr0(vmcr)); +} + +static void __hyp_text __vgic_v3_read_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr1(vmcr)); +} + +static void __hyp_text __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u64 val = vcpu_get_reg(vcpu, rt); + u8 bpr_min = __vgic_v3_bpr_min() - 1; + + /* Enforce BPR limiting */ + if (val < bpr_min) + val = bpr_min; + + val <<= ICH_VMCR_BPR0_SHIFT; + val &= ICH_VMCR_BPR0_MASK; + vmcr &= ~ICH_VMCR_BPR0_MASK; + vmcr |= val; + + __vgic_v3_write_vmcr(vmcr); +} + +static void __hyp_text __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u64 val = vcpu_get_reg(vcpu, rt); + u8 bpr_min = __vgic_v3_bpr_min(); + + if (vmcr & ICH_VMCR_CBPR_MASK) + return; + + /* Enforce BPR limiting */ + if (val < bpr_min) + val = bpr_min; + + val <<= ICH_VMCR_BPR1_SHIFT; + val &= 
ICH_VMCR_BPR1_MASK; + vmcr &= ~ICH_VMCR_BPR1_MASK; + vmcr |= val; + + __vgic_v3_write_vmcr(vmcr); +} + +static void __hyp_text __vgic_v3_read_apxrn(struct kvm_vcpu *vcpu, int rt, int n) +{ + u32 val; + + if (!__vgic_v3_get_group(vcpu)) + val = __vgic_v3_read_ap0rn(n); + else + val = __vgic_v3_read_ap1rn(n); + + vcpu_set_reg(vcpu, rt, val); +} + +static void __hyp_text __vgic_v3_write_apxrn(struct kvm_vcpu *vcpu, int rt, int n) +{ + u32 val = vcpu_get_reg(vcpu, rt); + + if (!__vgic_v3_get_group(vcpu)) + __vgic_v3_write_ap0rn(val, n); + else + __vgic_v3_write_ap1rn(val, n); +} + +static void __hyp_text __vgic_v3_read_apxr0(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_read_apxrn(vcpu, rt, 0); +} + +static void __hyp_text __vgic_v3_read_apxr1(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_read_apxrn(vcpu, rt, 1); +} + +static void __hyp_text __vgic_v3_read_apxr2(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_read_apxrn(vcpu, rt, 2); +} + +static void __hyp_text __vgic_v3_read_apxr3(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_read_apxrn(vcpu, rt, 3); +} + +static void __hyp_text __vgic_v3_write_apxr0(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_write_apxrn(vcpu, rt, 0); +} + +static void __hyp_text __vgic_v3_write_apxr1(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_write_apxrn(vcpu, rt, 1); +} + +static void __hyp_text __vgic_v3_write_apxr2(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_write_apxrn(vcpu, rt, 2); +} + +static void __hyp_text __vgic_v3_write_apxr3(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_write_apxrn(vcpu, rt, 3); +} + +static void __hyp_text __vgic_v3_read_hppir(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + u64 lr_val; + int lr, lr_grp, grp; + + grp = __vgic_v3_get_group(vcpu); + + lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val); + if (lr == -1) + goto spurious; + + lr_grp = !!(lr_val & ICH_LR_GROUP); + if (lr_grp != grp) + lr_val = 
ICC_IAR1_EL1_SPURIOUS; + +spurious: + vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK); +} + +static void __hyp_text __vgic_v3_read_pmr(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + vmcr &= ICH_VMCR_PMR_MASK; + vmcr >>= ICH_VMCR_PMR_SHIFT; + vcpu_set_reg(vcpu, rt, vmcr); +} + +static void __hyp_text __vgic_v3_write_pmr(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + u32 val = vcpu_get_reg(vcpu, rt); + + val <<= ICH_VMCR_PMR_SHIFT; + val &= ICH_VMCR_PMR_MASK; + vmcr &= ~ICH_VMCR_PMR_MASK; + vmcr |= val; + + write_gicreg(vmcr, ICH_VMCR_EL2); +} + +static void __hyp_text __vgic_v3_read_rpr(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + u32 val = __vgic_v3_get_highest_active_priority(); + vcpu_set_reg(vcpu, rt, val); +} + +static void __hyp_text __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + u32 vtr, val; + + vtr = read_gicreg(ICH_VTR_EL2); + /* PRIbits */ + val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; + /* IDbits */ + val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; + /* SEIS */ + val |= ((vtr >> 22) & 1) << ICC_CTLR_EL1_SEIS_SHIFT; + /* A3V */ + val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; + /* EOImode */ + val |= ((vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT; + /* CBPR */ + val |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; + + vcpu_set_reg(vcpu, rt, val); +} + +static void __hyp_text __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + u32 val = vcpu_get_reg(vcpu, rt); + + if (val & ICC_CTLR_EL1_CBPR_MASK) + vmcr |= ICH_VMCR_CBPR_MASK; + else + vmcr &= ~ICH_VMCR_CBPR_MASK; + + if (val & ICC_CTLR_EL1_EOImode_MASK) + vmcr |= ICH_VMCR_EOIM_MASK; + else + vmcr &= ~ICH_VMCR_EOIM_MASK; + + write_gicreg(vmcr, ICH_VMCR_EL2); +} + +int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) +{ + int rt; + u32 esr; + u32 vmcr; + void (*fn)(struct kvm_vcpu *, u32, int); + bool is_read; + u32 sysreg; + + esr = kvm_vcpu_get_hsr(vcpu); + if 
(vcpu_mode_is_32bit(vcpu)) { + if (!kvm_condition_valid(vcpu)) { + __kvm_skip_instr(vcpu); + return 1; + } + + sysreg = esr_cp15_to_sysreg(esr); + } else { + sysreg = esr_sys64_to_sysreg(esr); + } + + is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; + + switch (sysreg) { + case SYS_ICC_IAR0_EL1: + case SYS_ICC_IAR1_EL1: + if (unlikely(!is_read)) + return 0; + fn = __vgic_v3_read_iar; + break; + case SYS_ICC_EOIR0_EL1: + case SYS_ICC_EOIR1_EL1: + if (unlikely(is_read)) + return 0; + fn = __vgic_v3_write_eoir; + break; + case SYS_ICC_IGRPEN1_EL1: + if (is_read) + fn = __vgic_v3_read_igrpen1; + else + fn = __vgic_v3_write_igrpen1; + break; + case SYS_ICC_BPR1_EL1: + if (is_read) + fn = __vgic_v3_read_bpr1; + else + fn = __vgic_v3_write_bpr1; + break; + case SYS_ICC_AP0Rn_EL1(0): + case SYS_ICC_AP1Rn_EL1(0): + if (is_read) + fn = __vgic_v3_read_apxr0; + else + fn = __vgic_v3_write_apxr0; + break; + case SYS_ICC_AP0Rn_EL1(1): + case SYS_ICC_AP1Rn_EL1(1): + if (is_read) + fn = __vgic_v3_read_apxr1; + else + fn = __vgic_v3_write_apxr1; + break; + case SYS_ICC_AP0Rn_EL1(2): + case SYS_ICC_AP1Rn_EL1(2): + if (is_read) + fn = __vgic_v3_read_apxr2; + else + fn = __vgic_v3_write_apxr2; + break; + case SYS_ICC_AP0Rn_EL1(3): + case SYS_ICC_AP1Rn_EL1(3): + if (is_read) + fn = __vgic_v3_read_apxr3; + else + fn = __vgic_v3_write_apxr3; + break; + case SYS_ICC_HPPIR0_EL1: + case SYS_ICC_HPPIR1_EL1: + if (unlikely(!is_read)) + return 0; + fn = __vgic_v3_read_hppir; + break; + case SYS_ICC_IGRPEN0_EL1: + if (is_read) + fn = __vgic_v3_read_igrpen0; + else + fn = __vgic_v3_write_igrpen0; + break; + case SYS_ICC_BPR0_EL1: + if (is_read) + fn = __vgic_v3_read_bpr0; + else + fn = __vgic_v3_write_bpr0; + break; + case SYS_ICC_DIR_EL1: + if (unlikely(is_read)) + return 0; + fn = __vgic_v3_write_dir; + break; + case SYS_ICC_RPR_EL1: + if (unlikely(!is_read)) + return 0; + fn = __vgic_v3_read_rpr; + break; + case SYS_ICC_CTLR_EL1: + if (is_read) + fn = 
__vgic_v3_read_ctlr; + else + fn = __vgic_v3_write_ctlr; + break; + case SYS_ICC_PMR_EL1: + if (is_read) + fn = __vgic_v3_read_pmr; + else + fn = __vgic_v3_write_pmr; + break; + default: + return 0; + } + + vmcr = __vgic_v3_read_vmcr(); + rt = kvm_vcpu_sys_get_rt(vcpu); + fn(vcpu, vmcr, rt); + + __kvm_skip_instr(vcpu); + + return 1; +} diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c new file mode 100644 index 000000000000..550dfa3e53cd --- /dev/null +++ b/arch/arm64/kvm/hypercalls.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2019 Arm Ltd. + +#include +#include + +#include + +#include +#include + +int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) +{ + u32 func_id = smccc_get_function(vcpu); + long val = SMCCC_RET_NOT_SUPPORTED; + u32 feature; + gpa_t gpa; + + switch (func_id) { + case ARM_SMCCC_VERSION_FUNC_ID: + val = ARM_SMCCC_VERSION_1_1; + break; + case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: + feature = smccc_get_arg1(vcpu); + switch (feature) { + case ARM_SMCCC_ARCH_WORKAROUND_1: + switch (kvm_arm_harden_branch_predictor()) { + case KVM_BP_HARDEN_UNKNOWN: + break; + case KVM_BP_HARDEN_WA_NEEDED: + val = SMCCC_RET_SUCCESS; + break; + case KVM_BP_HARDEN_NOT_REQUIRED: + val = SMCCC_RET_NOT_REQUIRED; + break; + } + break; + case ARM_SMCCC_ARCH_WORKAROUND_2: + switch (kvm_arm_have_ssbd()) { + case KVM_SSBD_FORCE_DISABLE: + case KVM_SSBD_UNKNOWN: + break; + case KVM_SSBD_KERNEL: + val = SMCCC_RET_SUCCESS; + break; + case KVM_SSBD_FORCE_ENABLE: + case KVM_SSBD_MITIGATED: + val = SMCCC_RET_NOT_REQUIRED; + break; + } + break; + case ARM_SMCCC_HV_PV_TIME_FEATURES: + val = SMCCC_RET_SUCCESS; + break; + } + break; + case ARM_SMCCC_HV_PV_TIME_FEATURES: + val = kvm_hypercall_pv_features(vcpu); + break; + case ARM_SMCCC_HV_PV_TIME_ST: + gpa = kvm_init_stolen_time(vcpu); + if (gpa != GPA_INVALID) + val = gpa; + break; + default: + return kvm_psci_call(vcpu); + } + + smccc_set_retval(vcpu, val, 0, 0, 0); + return 1; +} diff --git 
/**
 * kvm_mmio_write_buf -- Store an MMIO value into a byte buffer
 *
 * @buf:  destination buffer, at least @len bytes long
 * @len:  access width in bytes; 1, 2, 4 and 8 are supported
 * @data: value to store; it is truncated to the access width
 *
 * The value is first narrowed to an integer of exactly @len bytes and
 * then copied in host byte order. Any other @len leaves @buf untouched
 * rather than copying from an unset source.
 */
void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data)
{
	union {
		u8 byte;
		u16 hword;
		u32 word;
		u64 dword;
	} tmp;

	switch (len) {
	case 1:
		tmp.byte = data;
		break;
	case 2:
		tmp.hword = data;
		break;
	case 4:
		tmp.word = data;
		break;
	case 8:
		tmp.dword = data;
		break;
	default:
		/* Unsupported access width: there is nothing valid to copy */
		return;
	}

	/* Every member starts at the union's address, so &tmp suits all widths */
	memcpy(buf, &tmp, len);
}
/**
 * io_mem_abort -- Handle a guest data abort that targets MMIO space
 *
 * @vcpu:      The VCPU pointer
 * @run:       The VCPU run struct, filled in for a possible exit
 * @fault_ipa: The faulting intermediate physical address
 *
 * Return: 0 when @run has been set up for an exit to user space
 * (KVM_EXIT_ARM_NISV or KVM_EXIT_MMIO), 1 when the access was emulated
 * on the in-kernel MMIO bus or a data abort was injected into the
 * guest, and -ENOSYS when there is no valid syndrome and user space
 * has not asked to be told about it.
 */
int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
		 phys_addr_t fault_ipa)
{
	unsigned long data;
	unsigned long rt;
	int ret;
	bool is_write;
	int len;
	u8 data_buf[8];

	/*
	 * No valid syndrome? Ask userspace for help if it has
	 * volunteered to do so, and bail out otherwise.
	 */
	if (!kvm_vcpu_dabt_isvalid(vcpu)) {
		if (vcpu->kvm->arch.return_nisv_io_abort_to_user) {
			run->exit_reason = KVM_EXIT_ARM_NISV;
			run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu);
			run->arm_nisv.fault_ipa = fault_ipa;
			return 0;
		}

		kvm_pr_unimpl("Data abort outside memslots with no valid syndrome info\n");
		return -ENOSYS;
	}

	/* Page table accesses IO mem: tell guest to fix its TTBR */
	if (kvm_vcpu_dabt_iss1tw(vcpu)) {
		kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
		return 1;
	}

	/*
	 * Prepare MMIO operation. First decode the syndrome data we get
	 * from the CPU. Then try if some in-kernel emulation feels
	 * responsible, otherwise let user space do its magic.
	 */
	is_write = kvm_vcpu_dabt_iswrite(vcpu);
	len = kvm_vcpu_dabt_get_as(vcpu);
	rt = kvm_vcpu_dabt_get_rd(vcpu);

	if (is_write) {
		/* Stage the guest's register value in data_buf for the bus write */
		data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt),
					       len);

		trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data);
		kvm_mmio_write_buf(data_buf, len, data);

		ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len,
				       data_buf);
	} else {
		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len,
			       fault_ipa, NULL);

		ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len,
				      data_buf);
	}

	/* Now prepare kvm_run for the potential return to userland. */
	run->mmio.is_write = is_write;
	run->mmio.phys_addr = fault_ipa;
	run->mmio.len = len;
	vcpu->mmio_needed = 1;

	if (!ret) {
		/* We handled the access successfully in the kernel. */
		if (!is_write)
			memcpy(run->mmio.data, data_buf, len);
		vcpu->stat.mmio_exit_kernel++;
		/* Completes the access: loads the result register and skips the instruction */
		kvm_handle_mmio_return(vcpu, run);
		return 1;
	}

	/* No in-kernel device took the access: hand it over to user space */
	if (is_write)
		memcpy(run->mmio.data, data_buf, len);
	vcpu->stat.mmio_exit_user++;
	run->exit_reason = KVM_EXIT_MMIO;
	return 0;
}
KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1) + +static bool is_iomap(unsigned long flags) +{ + return flags & KVM_S2PTE_FLAG_IS_IOMAP; +} + +static bool memslot_is_logging(struct kvm_memory_slot *memslot) +{ + return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); +} + +/** + * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8 + * @kvm: pointer to kvm structure. + * + * Interface to HYP function to flush all VM TLB entries + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + +static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) +{ + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); +} + +/* + * D-Cache management functions. They take the page table entries by + * value, as they are flushing the cache using the kernel mapping (or + * kmap on 32bit). + */ +static void kvm_flush_dcache_pte(pte_t pte) +{ + __kvm_flush_dcache_pte(pte); +} + +static void kvm_flush_dcache_pmd(pmd_t pmd) +{ + __kvm_flush_dcache_pmd(pmd); +} + +static void kvm_flush_dcache_pud(pud_t pud) +{ + __kvm_flush_dcache_pud(pud); +} + +static bool kvm_is_device_pfn(unsigned long pfn) +{ + return !pfn_valid(pfn); +} + +/** + * stage2_dissolve_pmd() - clear and flush huge PMD entry + * @kvm: pointer to kvm structure. + * @addr: IPA + * @pmd: pmd pointer for IPA + * + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. + */ +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) +{ + if (!pmd_thp_or_huge(*pmd)) + return; + + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + put_page(virt_to_page(pmd)); +} + +/** + * stage2_dissolve_pud() - clear and flush huge PUD entry + * @kvm: pointer to kvm structure. + * @addr: IPA + * @pud: pud pointer for IPA + * + * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. 
/**
 * stage2_dissolve_pud() - clear and flush huge PUD entry
 * @kvm:	pointer to kvm structure.
 * @addr:	IPA
 * @pudp:	pud pointer for IPA
 *
 * If @pudp is a huge stage-2 PUD entry, clear it, invalidate the TLB
 * for @addr and drop the reference held on the page containing the
 * entry. Non-huge entries are left untouched.
 */
static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
{
	/* Only huge mappings are dissolved here */
	if (!stage2_pud_huge(kvm, *pudp))
		return;

	stage2_pud_clear(kvm, pudp);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	put_page(virt_to_page(pudp));
}
pmd_t new_pmd) +{ + WRITE_ONCE(*pmdp, new_pmd); + dsb(ishst); +} + +static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep) +{ + kvm_set_pmd(pmdp, kvm_mk_pmd(ptep)); +} + +static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp) +{ + WRITE_ONCE(*pudp, kvm_mk_pud(pmdp)); + dsb(ishst); +} + +static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp) +{ + WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp)); + dsb(ishst); +} + +/* + * Unmapping vs dcache management: + * + * If a guest maps certain memory pages as uncached, all writes will + * bypass the data cache and go directly to RAM. However, the CPUs + * can still speculate reads (not writes) and fill cache lines with + * data. + * + * Those cache lines will be *clean* cache lines though, so a + * clean+invalidate operation is equivalent to an invalidate + * operation, because no cache lines are marked dirty. + * + * Those clean cache lines could be filled prior to an uncached write + * by the guest, and the cache coherent IO subsystem would therefore + * end up writing old data to disk. + * + * This is why right after unmapping a page/section and invalidating + * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure + * the IO subsystem will never hit in the cache. + * + * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as + * we then fully enforce cacheability of RAM, no matter what the guest + * does. 
/*
 * Unmap every populated stage-2 PTE covering [addr, end) under @pmd.
 *
 * Each present entry is cleared, its TLB entry invalidated, the data
 * cache flushed for non-device pages, and the reference on the PTE
 * table page dropped. If the PTE table ends up empty, the PMD entry
 * that points to it is cleared as well.
 */
static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
			      phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t start_addr = addr;
	pte_t *pte, *start_pte;

	start_pte = pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte)) {
			/* Keep a copy: the cache flush needs the old mapping */
			pte_t old_pte = *pte;

			kvm_set_pte(pte, __pte(0));
			kvm_tlb_flush_vmid_ipa(kvm, addr);

			/* No need to invalidate the cache for device mappings */
			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
				kvm_flush_dcache_pte(old_pte);

			put_page(virt_to_page(pte));
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);

	/* Release the PTE table itself once nothing is mapped through it */
	if (stage2_pte_table_empty(kvm, start_pte))
		clear_stage2_pmd_entry(kvm, pmd, start_addr);
}
(stage2_pud_table_empty(kvm, start_pud)) + clear_stage2_pgd_entry(kvm, pgd, start_addr); +} + +/** + * unmap_stage2_range -- Clear stage2 page table entries to unmap a range + * @kvm: The VM pointer + * @start: The intermediate physical base address of the range to unmap + * @size: The size of the area to unmap + * + * Clear a range of stage-2 mappings, lowering the various ref-counts. Must + * be called while holding mmu_lock (unless for freeing the stage2 pgd before + * destroying the VM), otherwise another faulting VCPU may come in and mess + * with things behind our backs. + */ +static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) +{ + pgd_t *pgd; + phys_addr_t addr = start, end = start + size; + phys_addr_t next; + + assert_spin_locked(&kvm->mmu_lock); + WARN_ON(size & ~PAGE_MASK); + + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); + do { + /* + * Make sure the page table is still active, as another thread + * could have possibly freed the page table, while we released + * the lock. + */ + if (!READ_ONCE(kvm->arch.pgd)) + break; + next = stage2_pgd_addr_end(kvm, addr, end); + if (!stage2_pgd_none(kvm, *pgd)) + unmap_stage2_puds(kvm, pgd, addr, next); + /* + * If the range is too large, release the kvm->mmu_lock + * to prevent starvation and lockup detector warnings. 
+ */ + if (next != end) + cond_resched_lock(&kvm->mmu_lock); + } while (pgd++, addr = next, addr != end); +} + +static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd, + phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte))) + kvm_flush_dcache_pte(*pte); + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, + phys_addr_t addr, phys_addr_t end) +{ + pmd_t *pmd; + phys_addr_t next; + + pmd = stage2_pmd_offset(kvm, pud, addr); + do { + next = stage2_pmd_addr_end(kvm, addr, end); + if (!pmd_none(*pmd)) { + if (pmd_thp_or_huge(*pmd)) + kvm_flush_dcache_pmd(*pmd); + else + stage2_flush_ptes(kvm, pmd, addr, next); + } + } while (pmd++, addr = next, addr != end); +} + +static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) +{ + pud_t *pud; + phys_addr_t next; + + pud = stage2_pud_offset(kvm, pgd, addr); + do { + next = stage2_pud_addr_end(kvm, addr, end); + if (!stage2_pud_none(kvm, *pud)) { + if (stage2_pud_huge(kvm, *pud)) + kvm_flush_dcache_pud(*pud); + else + stage2_flush_pmds(kvm, pud, addr, next); + } + } while (pud++, addr = next, addr != end); +} + +static void stage2_flush_memslot(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; + phys_addr_t end = addr + PAGE_SIZE * memslot->npages; + phys_addr_t next; + pgd_t *pgd; + + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); + do { + next = stage2_pgd_addr_end(kvm, addr, end); + if (!stage2_pgd_none(kvm, *pgd)) + stage2_flush_puds(kvm, pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + +/** + * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 + * @kvm: The struct kvm pointer + * + * Go through the stage 2 page tables and invalidate any cache lines + * backing memory already mapped to the VM. 
+ */ +static void stage2_flush_vm(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) + stage2_flush_memslot(kvm, memslot); + + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); +} + +static void clear_hyp_pgd_entry(pgd_t *pgd) +{ + pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL); + pgd_clear(pgd); + pud_free(NULL, pud_table); + put_page(virt_to_page(pgd)); +} + +static void clear_hyp_pud_entry(pud_t *pud) +{ + pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0); + VM_BUG_ON(pud_huge(*pud)); + pud_clear(pud); + pmd_free(NULL, pmd_table); + put_page(virt_to_page(pud)); +} + +static void clear_hyp_pmd_entry(pmd_t *pmd) +{ + pte_t *pte_table = pte_offset_kernel(pmd, 0); + VM_BUG_ON(pmd_thp_or_huge(*pmd)); + pmd_clear(pmd); + pte_free_kernel(NULL, pte_table); + put_page(virt_to_page(pmd)); +} + +static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte, *start_pte; + + start_pte = pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + kvm_set_pte(pte, __pte(0)); + put_page(virt_to_page(pte)); + } + } while (pte++, addr += PAGE_SIZE, addr != end); + + if (hyp_pte_table_empty(start_pte)) + clear_hyp_pmd_entry(pmd); +} + +static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next; + pmd_t *pmd, *start_pmd; + + start_pmd = pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + /* Hyp doesn't use huge pmds */ + if (!pmd_none(*pmd)) + unmap_hyp_ptes(pmd, addr, next); + } while (pmd++, addr = next, addr != end); + + if (hyp_pmd_table_empty(start_pmd)) + clear_hyp_pud_entry(pud); +} + +static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next; + pud_t *pud, *start_pud; + + start_pud = pud = pud_offset(pgd, addr); + do { + next = 
pud_addr_end(addr, end); + /* Hyp doesn't use huge puds */ + if (!pud_none(*pud)) + unmap_hyp_pmds(pud, addr, next); + } while (pud++, addr = next, addr != end); + + if (hyp_pud_table_empty(start_pud)) + clear_hyp_pgd_entry(pgd); +} + +static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd) +{ + return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1); +} + +static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd, + phys_addr_t start, u64 size) +{ + pgd_t *pgd; + phys_addr_t addr = start, end = start + size; + phys_addr_t next; + + /* + * We don't unmap anything from HYP, except at the hyp tear down. + * Hence, we don't have to invalidate the TLBs here. + */ + pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); + do { + next = pgd_addr_end(addr, end); + if (!pgd_none(*pgd)) + unmap_hyp_puds(pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + +static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) +{ + __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size); +} + +static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size) +{ + __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size); +} + +/** + * free_hyp_pgds - free Hyp-mode page tables + * + * Assumes hyp_pgd is a page table used strictly in Hyp-mode and + * therefore contains either mappings in the kernel memory area (above + * PAGE_OFFSET), or device mappings in the idmap range. + * + * boot_hyp_pgd should only map the idmap range, and is only used in + * the extended idmap case. + */ +void free_hyp_pgds(void) +{ + pgd_t *id_pgd; + + mutex_lock(&kvm_hyp_pgd_mutex); + + id_pgd = boot_hyp_pgd ? 
boot_hyp_pgd : hyp_pgd; + + if (id_pgd) { + /* In case we never called hyp_mmu_init() */ + if (!io_map_base) + io_map_base = hyp_idmap_start; + unmap_hyp_idmap_range(id_pgd, io_map_base, + hyp_idmap_start + PAGE_SIZE - io_map_base); + } + + if (boot_hyp_pgd) { + free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); + boot_hyp_pgd = NULL; + } + + if (hyp_pgd) { + unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET), + (uintptr_t)high_memory - PAGE_OFFSET); + + free_pages((unsigned long)hyp_pgd, hyp_pgd_order); + hyp_pgd = NULL; + } + if (merged_hyp_pgd) { + clear_page(merged_hyp_pgd); + free_page((unsigned long)merged_hyp_pgd); + merged_hyp_pgd = NULL; + } + + mutex_unlock(&kvm_hyp_pgd_mutex); +} + +static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, + unsigned long end, unsigned long pfn, + pgprot_t prot) +{ + pte_t *pte; + unsigned long addr; + + addr = start; + do { + pte = pte_offset_kernel(pmd, addr); + kvm_set_pte(pte, kvm_pfn_pte(pfn, prot)); + get_page(virt_to_page(pte)); + pfn++; + } while (addr += PAGE_SIZE, addr != end); +} + +static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, + unsigned long end, unsigned long pfn, + pgprot_t prot) +{ + pmd_t *pmd; + pte_t *pte; + unsigned long addr, next; + + addr = start; + do { + pmd = pmd_offset(pud, addr); + + BUG_ON(pmd_sect(*pmd)); + + if (pmd_none(*pmd)) { + pte = pte_alloc_one_kernel(NULL); + if (!pte) { + kvm_err("Cannot allocate Hyp pte\n"); + return -ENOMEM; + } + kvm_pmd_populate(pmd, pte); + get_page(virt_to_page(pmd)); + } + + next = pmd_addr_end(addr, end); + + create_hyp_pte_mappings(pmd, addr, next, pfn, prot); + pfn += (next - addr) >> PAGE_SHIFT; + } while (addr = next, addr != end); + + return 0; +} + +static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, + unsigned long end, unsigned long pfn, + pgprot_t prot) +{ + pud_t *pud; + pmd_t *pmd; + unsigned long addr, next; + int ret; + + addr = start; + do { + pud = pud_offset(pgd, addr); + + if 
(pud_none_or_clear_bad(pud)) { + pmd = pmd_alloc_one(NULL, addr); + if (!pmd) { + kvm_err("Cannot allocate Hyp pmd\n"); + return -ENOMEM; + } + kvm_pud_populate(pud, pmd); + get_page(virt_to_page(pud)); + } + + next = pud_addr_end(addr, end); + ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot); + if (ret) + return ret; + pfn += (next - addr) >> PAGE_SHIFT; + } while (addr = next, addr != end); + + return 0; +} + +static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, + unsigned long start, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + unsigned long addr, next; + int err = 0; + + mutex_lock(&kvm_hyp_pgd_mutex); + addr = start & PAGE_MASK; + end = PAGE_ALIGN(end); + do { + pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); + + if (pgd_none(*pgd)) { + pud = pud_alloc_one(NULL, addr); + if (!pud) { + kvm_err("Cannot allocate Hyp pud\n"); + err = -ENOMEM; + goto out; + } + kvm_pgd_populate(pgd, pud); + get_page(virt_to_page(pgd)); + } + + next = pgd_addr_end(addr, end); + err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot); + if (err) + goto out; + pfn += (next - addr) >> PAGE_SHIFT; + } while (addr = next, addr != end); +out: + mutex_unlock(&kvm_hyp_pgd_mutex); + return err; +} + +static phys_addr_t kvm_kaddr_to_phys(void *kaddr) +{ + if (!is_vmalloc_addr(kaddr)) { + BUG_ON(!virt_addr_valid(kaddr)); + return __pa(kaddr); + } else { + return page_to_phys(vmalloc_to_page(kaddr)) + + offset_in_page(kaddr); + } +} + +/** + * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode + * @from: The virtual kernel start address of the range + * @to: The virtual kernel end address of the range (exclusive) + * @prot: The protection to be applied to this range + * + * The same virtual address as the kernel virtual address is also used + * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying + * physical pages. 
+ */ +int create_hyp_mappings(void *from, void *to, pgprot_t prot) +{ + phys_addr_t phys_addr; + unsigned long virt_addr; + unsigned long start = kern_hyp_va((unsigned long)from); + unsigned long end = kern_hyp_va((unsigned long)to); + + if (is_kernel_in_hyp_mode()) + return 0; + + start = start & PAGE_MASK; + end = PAGE_ALIGN(end); + + for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { + int err; + + phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); + err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD, + virt_addr, virt_addr + PAGE_SIZE, + __phys_to_pfn(phys_addr), + prot); + if (err) + return err; + } + + return 0; +} + +static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, + unsigned long *haddr, pgprot_t prot) +{ + pgd_t *pgd = hyp_pgd; + unsigned long base; + int ret = 0; + + mutex_lock(&kvm_hyp_pgd_mutex); + + /* + * This assumes that we we have enough space below the idmap + * page to allocate our VAs. If not, the check below will + * kick. A potential alternative would be to detect that + * overflow and switch to an allocation above the idmap. + * + * The allocated size is always a multiple of PAGE_SIZE. + */ + size = PAGE_ALIGN(size + offset_in_page(phys_addr)); + base = io_map_base - size; + + /* + * Verify that BIT(VA_BITS - 1) hasn't been flipped by + * allocating the new area, as it would indicate we've + * overflowed the idmap/IO address range. 
+ */ + if ((base ^ io_map_base) & BIT(VA_BITS - 1)) + ret = -ENOMEM; + else + io_map_base = base; + + mutex_unlock(&kvm_hyp_pgd_mutex); + + if (ret) + goto out; + + if (__kvm_cpu_uses_extended_idmap()) + pgd = boot_hyp_pgd; + + ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), + base, base + size, + __phys_to_pfn(phys_addr), prot); + if (ret) + goto out; + + *haddr = base + offset_in_page(phys_addr); + +out: + return ret; +} + +/** + * create_hyp_io_mappings - Map IO into both kernel and HYP + * @phys_addr: The physical start address which gets mapped + * @size: Size of the region being mapped + * @kaddr: Kernel VA for this mapping + * @haddr: HYP VA for this mapping + */ +int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, + void __iomem **kaddr, + void __iomem **haddr) +{ + unsigned long addr; + int ret; + + *kaddr = ioremap(phys_addr, size); + if (!*kaddr) + return -ENOMEM; + + if (is_kernel_in_hyp_mode()) { + *haddr = *kaddr; + return 0; + } + + ret = __create_hyp_private_mapping(phys_addr, size, + &addr, PAGE_HYP_DEVICE); + if (ret) { + iounmap(*kaddr); + *kaddr = NULL; + *haddr = NULL; + return ret; + } + + *haddr = (void __iomem *)addr; + return 0; +} + +/** + * create_hyp_exec_mappings - Map an executable range into HYP + * @phys_addr: The physical start address which gets mapped + * @size: Size of the region being mapped + * @haddr: HYP VA for this mapping + */ +int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, + void **haddr) +{ + unsigned long addr; + int ret; + + BUG_ON(is_kernel_in_hyp_mode()); + + ret = __create_hyp_private_mapping(phys_addr, size, + &addr, PAGE_HYP_EXEC); + if (ret) { + *haddr = NULL; + return ret; + } + + *haddr = (void *)addr; + return 0; +} + +/** + * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. + * @kvm: The KVM struct pointer for the VM. + * + * Allocates only the stage-2 HW PGD level table(s) of size defined by + * stage2_pgd_size(kvm). 
+ * + * Note we don't need locking here as this is only called when the VM is + * created, which can only be done once. + */ +int kvm_alloc_stage2_pgd(struct kvm *kvm) +{ + phys_addr_t pgd_phys; + pgd_t *pgd; + + if (kvm->arch.pgd != NULL) { + kvm_err("kvm_arch already initialized?\n"); + return -EINVAL; + } + + /* Allocate the HW PGD, making sure that each page gets its own refcount */ + pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO); + if (!pgd) + return -ENOMEM; + + pgd_phys = virt_to_phys(pgd); + if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm))) + return -EINVAL; + + kvm->arch.pgd = pgd; + kvm->arch.pgd_phys = pgd_phys; + return 0; +} + +static void stage2_unmap_memslot(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + hva_t hva = memslot->userspace_addr; + phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; + phys_addr_t size = PAGE_SIZE * memslot->npages; + hva_t reg_end = hva + size; + + /* + * A memory region could potentially cover multiple VMAs, and any holes + * between them, so iterate over all of them to find out if we should + * unmap any of them. 
+ * + * +--------------------------------------------+ + * +---------------+----------------+ +----------------+ + * | : VMA 1 | VMA 2 | | VMA 3 : | + * +---------------+----------------+ +----------------+ + * | memory region | + * +--------------------------------------------+ + */ + do { + struct vm_area_struct *vma = find_vma(current->mm, hva); + hva_t vm_start, vm_end; + + if (!vma || vma->vm_start >= reg_end) + break; + + /* + * Take the intersection of this VMA with the memory region + */ + vm_start = max(hva, vma->vm_start); + vm_end = min(reg_end, vma->vm_end); + + if (!(vma->vm_flags & VM_PFNMAP)) { + gpa_t gpa = addr + (vm_start - memslot->userspace_addr); + unmap_stage2_range(kvm, gpa, vm_end - vm_start); + } + hva = vm_end; + } while (hva < reg_end); +} + +/** + * stage2_unmap_vm - Unmap Stage-2 RAM mappings + * @kvm: The struct kvm pointer + * + * Go through the memregions and unmap any regular RAM + * backing memory already mapped to the VM. + */ +void stage2_unmap_vm(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int idx; + + idx = srcu_read_lock(&kvm->srcu); + down_read(&current->mm->mmap_sem); + spin_lock(&kvm->mmu_lock); + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) + stage2_unmap_memslot(kvm, memslot); + + spin_unlock(&kvm->mmu_lock); + up_read(&current->mm->mmap_sem); + srcu_read_unlock(&kvm->srcu, idx); +} + +/** + * kvm_free_stage2_pgd - free all stage-2 tables + * @kvm: The KVM struct pointer for the VM. + * + * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all + * underlying level-2 and level-3 tables before freeing the actual level-1 table + * and setting the struct pointer to NULL. 
+ */ +void kvm_free_stage2_pgd(struct kvm *kvm) +{ + void *pgd = NULL; + + spin_lock(&kvm->mmu_lock); + if (kvm->arch.pgd) { + unmap_stage2_range(kvm, 0, kvm_phys_size(kvm)); + pgd = READ_ONCE(kvm->arch.pgd); + kvm->arch.pgd = NULL; + kvm->arch.pgd_phys = 0; + } + spin_unlock(&kvm->mmu_lock); + + /* Free the HW pgd, one page at a time */ + if (pgd) + free_pages_exact(pgd, stage2_pgd_size(kvm)); +} + +static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); + if (stage2_pgd_none(kvm, *pgd)) { + if (!cache) + return NULL; + pud = mmu_memory_cache_alloc(cache); + stage2_pgd_populate(kvm, pgd, pud); + get_page(virt_to_page(pgd)); + } + + return stage2_pud_offset(kvm, pgd, addr); +} + +static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr) +{ + pud_t *pud; + pmd_t *pmd; + + pud = stage2_get_pud(kvm, cache, addr); + if (!pud || stage2_pud_huge(kvm, *pud)) + return NULL; + + if (stage2_pud_none(kvm, *pud)) { + if (!cache) + return NULL; + pmd = mmu_memory_cache_alloc(cache); + stage2_pud_populate(kvm, pud, pmd); + get_page(virt_to_page(pud)); + } + + return stage2_pmd_offset(kvm, pud, addr); +} + +static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache + *cache, phys_addr_t addr, const pmd_t *new_pmd) +{ + pmd_t *pmd, old_pmd; + +retry: + pmd = stage2_get_pmd(kvm, cache, addr); + VM_BUG_ON(!pmd); + + old_pmd = *pmd; + /* + * Multiple vcpus faulting on the same PMD entry, can + * lead to them sequentially updating the PMD with the + * same value. Following the break-before-make + * (pmd_clear() followed by tlb_flush()) process can + * hinder forward progress due to refaults generated + * on missing translations. + * + * Skip updating the page table if the entry is + * unchanged. 
+ */ + if (pmd_val(old_pmd) == pmd_val(*new_pmd)) + return 0; + + if (pmd_present(old_pmd)) { + /* + * If we already have PTE level mapping for this block, + * we must unmap it to avoid inconsistent TLB state and + * leaking the table page. We could end up in this situation + * if the memory slot was marked for dirty logging and was + * reverted, leaving PTE level mappings for the pages accessed + * during the period. So, unmap the PTE level mapping for this + * block and retry, as we could have released the upper level + * table in the process. + * + * Normal THP split/merge follows mmu_notifier callbacks and do + * get handled accordingly. + */ + if (!pmd_thp_or_huge(old_pmd)) { + unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE); + goto retry; + } + /* + * Mapping in huge pages should only happen through a + * fault. If a page is merged into a transparent huge + * page, the individual subpages of that huge page + * should be unmapped through MMU notifiers before we + * get here. + * + * Merging of CompoundPages is not supported; they + * should become splitting first, unmapped, merged, + * and mapped back in on-demand. + */ + WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + } else { + get_page(virt_to_page(pmd)); + } + + kvm_set_pmd(pmd, *new_pmd); + return 0; +} + +static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr, const pud_t *new_pudp) +{ + pud_t *pudp, old_pud; + +retry: + pudp = stage2_get_pud(kvm, cache, addr); + VM_BUG_ON(!pudp); + + old_pud = *pudp; + + /* + * A large number of vcpus faulting on the same stage 2 entry, + * can lead to a refault due to the stage2_pud_clear()/tlb_flush(). + * Skip updating the page tables if there is no change. 
+ */ + if (pud_val(old_pud) == pud_val(*new_pudp)) + return 0; + + if (stage2_pud_present(kvm, old_pud)) { + /* + * If we already have table level mapping for this block, unmap + * the range for this block and retry. + */ + if (!stage2_pud_huge(kvm, old_pud)) { + unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE); + goto retry; + } + + WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp)); + stage2_pud_clear(kvm, pudp); + kvm_tlb_flush_vmid_ipa(kvm, addr); + } else { + get_page(virt_to_page(pudp)); + } + + kvm_set_pud(pudp, *new_pudp); + return 0; +} + +/* + * stage2_get_leaf_entry - walk the stage2 VM page tables and return + * true if a valid and present leaf-entry is found. A pointer to the + * leaf-entry is returned in the appropriate level variable - pudpp, + * pmdpp, ptepp. + */ +static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr, + pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp) +{ + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + *pudpp = NULL; + *pmdpp = NULL; + *ptepp = NULL; + + pudp = stage2_get_pud(kvm, NULL, addr); + if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp)) + return false; + + if (stage2_pud_huge(kvm, *pudp)) { + *pudpp = pudp; + return true; + } + + pmdp = stage2_pmd_offset(kvm, pudp, addr); + if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) + return false; + + if (pmd_thp_or_huge(*pmdp)) { + *pmdpp = pmdp; + return true; + } + + ptep = pte_offset_kernel(pmdp, addr); + if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) + return false; + + *ptepp = ptep; + return true; +} + +static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) +{ + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + bool found; + + found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep); + if (!found) + return false; + + if (pudp) + return kvm_s2pud_exec(pudp); + else if (pmdp) + return kvm_s2pmd_exec(pmdp); + else + return kvm_s2pte_exec(ptep); +} + +static int stage2_set_pte(struct kvm *kvm, struct 
kvm_mmu_memory_cache *cache, + phys_addr_t addr, const pte_t *new_pte, + unsigned long flags) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte, old_pte; + bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; + bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE; + + VM_BUG_ON(logging_active && !cache); + + /* Create stage-2 page table mapping - Levels 0 and 1 */ + pud = stage2_get_pud(kvm, cache, addr); + if (!pud) { + /* + * Ignore calls from kvm_set_spte_hva for unallocated + * address ranges. + */ + return 0; + } + + /* + * While dirty page logging - dissolve huge PUD, then continue + * on to allocate page. + */ + if (logging_active) + stage2_dissolve_pud(kvm, addr, pud); + + if (stage2_pud_none(kvm, *pud)) { + if (!cache) + return 0; /* ignore calls from kvm_set_spte_hva */ + pmd = mmu_memory_cache_alloc(cache); + stage2_pud_populate(kvm, pud, pmd); + get_page(virt_to_page(pud)); + } + + pmd = stage2_pmd_offset(kvm, pud, addr); + if (!pmd) { + /* + * Ignore calls from kvm_set_spte_hva for unallocated + * address ranges. + */ + return 0; + } + + /* + * While dirty page logging - dissolve huge PMD, then continue on to + * allocate page. 
+ */ + if (logging_active) + stage2_dissolve_pmd(kvm, addr, pmd); + + /* Create stage-2 page mappings - Level 2 */ + if (pmd_none(*pmd)) { + if (!cache) + return 0; /* ignore calls from kvm_set_spte_hva */ + pte = mmu_memory_cache_alloc(cache); + kvm_pmd_populate(pmd, pte); + get_page(virt_to_page(pmd)); + } + + pte = pte_offset_kernel(pmd, addr); + + if (iomap && pte_present(*pte)) + return -EFAULT; + + /* Create 2nd stage page table mapping - Level 3 */ + old_pte = *pte; + if (pte_present(old_pte)) { + /* Skip page table update if there is no change */ + if (pte_val(old_pte) == pte_val(*new_pte)) + return 0; + + kvm_set_pte(pte, __pte(0)); + kvm_tlb_flush_vmid_ipa(kvm, addr); + } else { + get_page(virt_to_page(pte)); + } + + kvm_set_pte(pte, *new_pte); + return 0; +} + +#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +static int stage2_ptep_test_and_clear_young(pte_t *pte) +{ + if (pte_young(*pte)) { + *pte = pte_mkold(*pte); + return 1; + } + return 0; +} +#else +static int stage2_ptep_test_and_clear_young(pte_t *pte) +{ + return __ptep_test_and_clear_young(pte); +} +#endif + +static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) +{ + return stage2_ptep_test_and_clear_young((pte_t *)pmd); +} + +static int stage2_pudp_test_and_clear_young(pud_t *pud) +{ + return stage2_ptep_test_and_clear_young((pte_t *)pud); +} + +/** + * kvm_phys_addr_ioremap - map a device range to guest IPA + * + * @kvm: The KVM pointer + * @guest_ipa: The IPA at which to insert the mapping + * @pa: The physical address of the device + * @size: The size of the mapping + */ +int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, + phys_addr_t pa, unsigned long size, bool writable) +{ + phys_addr_t addr, end; + int ret = 0; + unsigned long pfn; + struct kvm_mmu_memory_cache cache = { 0, }; + + end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK; + pfn = __phys_to_pfn(pa); + + for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { + pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE); + + 
if (writable) + pte = kvm_s2pte_mkwrite(pte); + + ret = mmu_topup_memory_cache(&cache, + kvm_mmu_cache_min_pages(kvm), + KVM_NR_MEM_OBJS); + if (ret) + goto out; + spin_lock(&kvm->mmu_lock); + ret = stage2_set_pte(kvm, &cache, addr, &pte, + KVM_S2PTE_FLAG_IS_IOMAP); + spin_unlock(&kvm->mmu_lock); + if (ret) + goto out; + + pfn++; + } + +out: + mmu_free_memory_cache(&cache); + return ret; +} + +static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) +{ + kvm_pfn_t pfn = *pfnp; + gfn_t gfn = *ipap >> PAGE_SHIFT; + + if (kvm_is_transparent_hugepage(pfn)) { + unsigned long mask; + /* + * The address we faulted on is backed by a transparent huge + * page. However, because we map the compound huge page and + * not the individual tail page, we need to transfer the + * refcount to the head page. We have to be careful that the + * THP doesn't start to split while we are adjusting the + * refcounts. + * + * We are sure this doesn't happen, because mmu_notifier_retry + * was successful and we are holding the mmu_lock, so if this + * THP is trying to split, it will be blocked in the mmu + * notifier before touching any of the pages, specifically + * before being able to call __split_huge_page_refcount(). + * + * We can therefore safely transfer the refcount from PG_tail + * to PG_head and switch the pfn from a tail page to the head + * page accordingly. 
+ */ + mask = PTRS_PER_PMD - 1; + VM_BUG_ON((gfn & mask) != (pfn & mask)); + if (pfn & mask) { + *ipap &= PMD_MASK; + kvm_release_pfn_clean(pfn); + pfn &= ~mask; + kvm_get_pfn(pfn); + *pfnp = pfn; + } + + return true; + } + + return false; +} + +/** + * stage2_wp_ptes - write protect PMD range + * @pmd: pointer to pmd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + if (!kvm_s2pte_readonly(pte)) + kvm_set_s2pte_readonly(pte); + } + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +/** + * stage2_wp_pmds - write protect PUD range + * kvm: kvm instance for the VM + * @pud: pointer to pud entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, + phys_addr_t addr, phys_addr_t end) +{ + pmd_t *pmd; + phys_addr_t next; + + pmd = stage2_pmd_offset(kvm, pud, addr); + + do { + next = stage2_pmd_addr_end(kvm, addr, end); + if (!pmd_none(*pmd)) { + if (pmd_thp_or_huge(*pmd)) { + if (!kvm_s2pmd_readonly(pmd)) + kvm_set_s2pmd_readonly(pmd); + } else { + stage2_wp_ptes(pmd, addr, next); + } + } + } while (pmd++, addr = next, addr != end); +} + +/** + * stage2_wp_puds - write protect PGD range + * @pgd: pointer to pgd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) +{ + pud_t *pud; + phys_addr_t next; + + pud = stage2_pud_offset(kvm, pgd, addr); + do { + next = stage2_pud_addr_end(kvm, addr, end); + if (!stage2_pud_none(kvm, *pud)) { + if (stage2_pud_huge(kvm, *pud)) { + if (!kvm_s2pud_readonly(pud)) + kvm_set_s2pud_readonly(pud); + } else { + stage2_wp_pmds(kvm, pud, addr, next); + } + } + } while (pud++, addr = next, addr != end); +} + +/** + * stage2_wp_range() - write protect stage2 memory region range + 
* @kvm: The KVM pointer + * @addr: Start address of range + * @end: End address of range + */ +static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) +{ + pgd_t *pgd; + phys_addr_t next; + + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); + do { + /* + * Release kvm_mmu_lock periodically if the memory region is + * large. Otherwise, we may see kernel panics with + * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, + * CONFIG_LOCKDEP. Additionally, holding the lock too long + * will also starve other vCPUs. We have to also make sure + * that the page tables are not freed while we released + * the lock. + */ + cond_resched_lock(&kvm->mmu_lock); + if (!READ_ONCE(kvm->arch.pgd)) + break; + next = stage2_pgd_addr_end(kvm, addr, end); + if (stage2_pgd_present(kvm, *pgd)) + stage2_wp_puds(kvm, pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + +/** + * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot + * @kvm: The KVM pointer + * @slot: The memory slot to write protect + * + * Called to start logging dirty pages after memory region + * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns + * all present PUD, PMD and PTEs are write protected in the memory region. + * Afterwards read of dirty page log can be called. + * + * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, + * serializing operations for VM memory regions. 
+ */ +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); + phys_addr_t start, end; + + if (WARN_ON_ONCE(!memslot)) + return; + + start = memslot->base_gfn << PAGE_SHIFT; + end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; + + spin_lock(&kvm->mmu_lock); + stage2_wp_range(kvm, start, end); + spin_unlock(&kvm->mmu_lock); + kvm_flush_remote_tlbs(kvm); +} + +/** + * kvm_mmu_write_protect_pt_masked() - write protect dirty pages + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory + * slot to be write protected + * + * Walks bits set in mask write protects the associated pte's. Caller must + * acquire kvm_mmu_lock. + */ +static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t base_gfn = slot->base_gfn + gfn_offset; + phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; + phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; + + stage2_wp_range(kvm, start, end); +} + +/* + * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected + * dirty pages. + * + * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to + * enable dirty logging for them. 
+ */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); +} + +static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) +{ + __clean_dcache_guest_page(pfn, size); +} + +static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size) +{ + __invalidate_icache_guest_page(pfn, size); +} + +static void kvm_send_hwpoison_signal(unsigned long address, short lsb) +{ + send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); +} + +static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, + unsigned long hva, + unsigned long map_size) +{ + gpa_t gpa_start; + hva_t uaddr_start, uaddr_end; + size_t size; + + size = memslot->npages * PAGE_SIZE; + + gpa_start = memslot->base_gfn << PAGE_SHIFT; + + uaddr_start = memslot->userspace_addr; + uaddr_end = uaddr_start + size; + + /* + * Pages belonging to memslots that don't have the same alignment + * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 + * PMD/PUD entries, because we'll end up mapping the wrong pages. + * + * Consider a layout like the following: + * + * memslot->userspace_addr: + * +-----+--------------------+--------------------+---+ + * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| + * +-----+--------------------+--------------------+---+ + * + * memslot->base_gfn << PAGE_SIZE: + * +---+--------------------+--------------------+-----+ + * |abc|def Stage-2 block | Stage-2 block |tvxyz| + * +---+--------------------+--------------------+-----+ + * + * If we create those stage-2 blocks, we'll end up with this incorrect + * mapping: + * d -> f + * e -> g + * f -> h + */ + if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) + return false; + + /* + * Next, let's make sure we're not trying to map anything not covered + * by the memslot. 
This means we have to prohibit block size mappings + * for the beginning and end of a non-block aligned and non-block sized + * memory slot (illustrated by the head and tail parts of the + * userspace view above containing pages 'abcde' and 'xyz', + * respectively). + * + * Note that it doesn't matter if we do the check using the + * userspace_addr or the base_gfn, as both are equally aligned (per + * the check above) and equally sized. + */ + return (hva & ~(map_size - 1)) >= uaddr_start && + (hva & ~(map_size - 1)) + map_size <= uaddr_end; +} + +static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + struct kvm_memory_slot *memslot, unsigned long hva, + unsigned long fault_status) +{ + int ret; + bool write_fault, writable, force_pte = false; + bool exec_fault, needs_exec; + unsigned long mmu_seq; + gfn_t gfn = fault_ipa >> PAGE_SHIFT; + struct kvm *kvm = vcpu->kvm; + struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; + struct vm_area_struct *vma; + short vma_shift; + kvm_pfn_t pfn; + pgprot_t mem_type = PAGE_S2; + bool logging_active = memslot_is_logging(memslot); + unsigned long vma_pagesize, flags = 0; + + write_fault = kvm_is_write_fault(vcpu); + exec_fault = kvm_vcpu_trap_is_iabt(vcpu); + VM_BUG_ON(write_fault && exec_fault); + + if (fault_status == FSC_PERM && !write_fault && !exec_fault) { + kvm_err("Unexpected L2 read permission error\n"); + return -EFAULT; + } + + /* Let's check if we will get back a huge page backed by hugetlbfs */ + down_read(&current->mm->mmap_sem); + vma = find_vma_intersection(current->mm, hva, hva + 1); + if (unlikely(!vma)) { + kvm_err("Failed to find VMA for hva 0x%lx\n", hva); + up_read(&current->mm->mmap_sem); + return -EFAULT; + } + + if (is_vm_hugetlb_page(vma)) + vma_shift = huge_page_shift(hstate_vma(vma)); + else + vma_shift = PAGE_SHIFT; + + vma_pagesize = 1ULL << vma_shift; + if (logging_active || + (vma->vm_flags & VM_PFNMAP) || + !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) { 
+ force_pte = true; + vma_pagesize = PAGE_SIZE; + } + + /* + * The stage2 has a minimum of 2 level table (For arm64 see + * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can + * use PMD_SIZE huge mappings (even when the PMD is folded into PGD). + * As for PUD huge maps, we must make sure that we have at least + * 3 levels, i.e, PMD is not folded. + */ + if (vma_pagesize == PMD_SIZE || + (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) + gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; + up_read(¤t->mm->mmap_sem); + + /* We need minimum second+third level pages */ + ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm), + KVM_NR_MEM_OBJS); + if (ret) + return ret; + + mmu_seq = vcpu->kvm->mmu_notifier_seq; + /* + * Ensure the read of mmu_notifier_seq happens before we call + * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk + * the page we just got a reference to gets unmapped before we have a + * chance to grab the mmu_lock, which ensure that if the page gets + * unmapped afterwards, the call to kvm_unmap_hva will take it away + * from us again properly. This smp_rmb() interacts with the smp_wmb() + * in kvm_mmu_notifier_invalidate_. + */ + smp_rmb(); + + pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); + if (pfn == KVM_PFN_ERR_HWPOISON) { + kvm_send_hwpoison_signal(hva, vma_shift); + return 0; + } + if (is_error_noslot_pfn(pfn)) + return -EFAULT; + + if (kvm_is_device_pfn(pfn)) { + mem_type = PAGE_S2_DEVICE; + flags |= KVM_S2PTE_FLAG_IS_IOMAP; + } else if (logging_active) { + /* + * Faults on pages in a memslot with logging enabled + * should not be mapped with huge pages (it introduces churn + * and performance degradation), so force a pte mapping. + */ + flags |= KVM_S2_FLAG_LOGGING_ACTIVE; + + /* + * Only actually map the page as writable if this was a write + * fault. 
+ */ + if (!write_fault) + writable = false; + } + + if (exec_fault && is_iomap(flags)) + return -ENOEXEC; + + spin_lock(&kvm->mmu_lock); + if (mmu_notifier_retry(kvm, mmu_seq)) + goto out_unlock; + + if (vma_pagesize == PAGE_SIZE && !force_pte) { + /* + * Only PMD_SIZE transparent hugepages(THP) are + * currently supported. This code will need to be + * updated to support other THP sizes. + * + * Make sure the host VA and the guest IPA are sufficiently + * aligned and that the block is contained within the memslot. + */ + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) && + transparent_hugepage_adjust(&pfn, &fault_ipa)) + vma_pagesize = PMD_SIZE; + } + + if (writable) + kvm_set_pfn_dirty(pfn); + + if (fault_status != FSC_PERM && !is_iomap(flags)) + clean_dcache_guest_page(pfn, vma_pagesize); + + if (exec_fault) + invalidate_icache_guest_page(pfn, vma_pagesize); + + /* + * If we took an execution fault we have made the + * icache/dcache coherent above and should now let the s2 + * mapping be executable. + * + * Write faults (!exec_fault && FSC_PERM) are orthogonal to + * execute permissions, and we preserve whatever we have. 
+ */ + needs_exec = exec_fault || + (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa)); + + if (vma_pagesize == PUD_SIZE) { + pud_t new_pud = kvm_pfn_pud(pfn, mem_type); + + new_pud = kvm_pud_mkhuge(new_pud); + if (writable) + new_pud = kvm_s2pud_mkwrite(new_pud); + + if (needs_exec) + new_pud = kvm_s2pud_mkexec(new_pud); + + ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud); + } else if (vma_pagesize == PMD_SIZE) { + pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type); + + new_pmd = kvm_pmd_mkhuge(new_pmd); + + if (writable) + new_pmd = kvm_s2pmd_mkwrite(new_pmd); + + if (needs_exec) + new_pmd = kvm_s2pmd_mkexec(new_pmd); + + ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); + } else { + pte_t new_pte = kvm_pfn_pte(pfn, mem_type); + + if (writable) { + new_pte = kvm_s2pte_mkwrite(new_pte); + mark_page_dirty(kvm, gfn); + } + + if (needs_exec) + new_pte = kvm_s2pte_mkexec(new_pte); + + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); + } + +out_unlock: + spin_unlock(&kvm->mmu_lock); + kvm_set_pfn_accessed(pfn); + kvm_release_pfn_clean(pfn); + return ret; +} + +/* + * Resolve the access fault by making the page young again. + * Note that because the faulting entry is guaranteed not to be + * cached in the TLB, we don't need to invalidate anything. + * Only the HW Access Flag updates are supported for Stage 2 (no DBM), + * so there is no need for atomic (pte|pmd)_mkyoung operations. 
+ */ +static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + kvm_pfn_t pfn; + bool pfn_valid = false; + + trace_kvm_access_fault(fault_ipa); + + spin_lock(&vcpu->kvm->mmu_lock); + + if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte)) + goto out; + + if (pud) { /* HugeTLB */ + *pud = kvm_s2pud_mkyoung(*pud); + pfn = kvm_pud_pfn(*pud); + pfn_valid = true; + } else if (pmd) { /* THP, HugeTLB */ + *pmd = pmd_mkyoung(*pmd); + pfn = pmd_pfn(*pmd); + pfn_valid = true; + } else { + *pte = pte_mkyoung(*pte); /* Just a page... */ + pfn = pte_pfn(*pte); + pfn_valid = true; + } + +out: + spin_unlock(&vcpu->kvm->mmu_lock); + if (pfn_valid) + kvm_set_pfn_accessed(pfn); +} + +/** + * kvm_handle_guest_abort - handles all 2nd stage aborts + * @vcpu: the VCPU pointer + * @run: the kvm_run structure + * + * Any abort that gets to the host is almost guaranteed to be caused by a + * missing second stage translation table entry, which can mean that either the + * guest simply needs more memory and we must allocate an appropriate page or it + * can mean that the guest tried to access I/O memory, which is emulated by user + * space. The distinction is based on the IPA causing the fault and whether this + * memory region has been registered as standard RAM by user space. + */ +int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + unsigned long fault_status; + phys_addr_t fault_ipa; + struct kvm_memory_slot *memslot; + unsigned long hva; + bool is_iabt, write_fault, writable; + gfn_t gfn; + int ret, idx; + + fault_status = kvm_vcpu_trap_get_fault_type(vcpu); + + fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); + is_iabt = kvm_vcpu_trap_is_iabt(vcpu); + + /* Synchronous External Abort? */ + if (kvm_vcpu_dabt_isextabt(vcpu)) { + /* + * For RAS the host kernel may handle this abort. + * There is no need to pass the error into the guest. 
+ */ + if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu))) + return 1; + + if (unlikely(!is_iabt)) { + kvm_inject_vabt(vcpu); + return 1; + } + } + + trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu), + kvm_vcpu_get_hfar(vcpu), fault_ipa); + + /* Check the stage-2 fault is trans. fault or write fault */ + if (fault_status != FSC_FAULT && fault_status != FSC_PERM && + fault_status != FSC_ACCESS) { + kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", + kvm_vcpu_trap_get_class(vcpu), + (unsigned long)kvm_vcpu_trap_get_fault(vcpu), + (unsigned long)kvm_vcpu_get_hsr(vcpu)); + return -EFAULT; + } + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + gfn = fault_ipa >> PAGE_SHIFT; + memslot = gfn_to_memslot(vcpu->kvm, gfn); + hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); + write_fault = kvm_is_write_fault(vcpu); + if (kvm_is_error_hva(hva) || (write_fault && !writable)) { + if (is_iabt) { + /* Prefetch Abort on I/O address */ + ret = -ENOEXEC; + goto out; + } + + /* + * Check for a cache maintenance operation. Since we + * ended-up here, we know it is outside of any memory + * slot. But we can't find out if that is for a device, + * or if the guest is just being stupid. The only thing + * we know for sure is that this range cannot be cached. + * + * So let's assume that the guest is just being + * cautious, and skip the instruction. + */ + if (kvm_vcpu_dabt_is_cm(vcpu)) { + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + ret = 1; + goto out_unlock; + } + + /* + * The IPA is reported as [MAX:12], so we need to + * complement it with the bottom 12 bits from the + * faulting VA. This is always 12 bits, irrespective + * of the page size. 
+ */ + fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); + ret = io_mem_abort(vcpu, run, fault_ipa); + goto out_unlock; + } + + /* Userspace should not be able to register out-of-bounds IPAs */ + VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm)); + + if (fault_status == FSC_ACCESS) { + handle_access_fault(vcpu, fault_ipa); + ret = 1; + goto out_unlock; + } + + ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); + if (ret == 0) + ret = 1; +out: + if (ret == -ENOEXEC) { + kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu)); + ret = 1; + } +out_unlock: + srcu_read_unlock(&vcpu->kvm->srcu, idx); + return ret; +} + +static int handle_hva_to_gpa(struct kvm *kvm, + unsigned long start, + unsigned long end, + int (*handler)(struct kvm *kvm, + gpa_t gpa, u64 size, + void *data), + void *data) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int ret = 0; + + slots = kvm_memslots(kvm); + + /* we only care about the pages that the guest sees */ + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gpa; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + + gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT; + ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data); + } + + return ret; +} + +static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) +{ + unmap_stage2_range(kvm, gpa, size); + return 0; +} + +int kvm_unmap_hva_range(struct kvm *kvm, + unsigned long start, unsigned long end) +{ + if (!kvm->arch.pgd) + return 0; + + trace_kvm_unmap_hva_range(start, end); + handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); + return 0; +} + +static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) +{ + pte_t *pte = (pte_t *)data; + + WARN_ON(size != PAGE_SIZE); + /* + * We can always call stage2_set_pte with 
KVM_S2PTE_FLAG_LOGGING_ACTIVE + * flag clear because MMU notifiers will have unmapped a huge PMD before + * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and + * therefore stage2_set_pte() never needs to clear out a huge PMD + * through this calling path. + */ + stage2_set_pte(kvm, NULL, gpa, pte, 0); + return 0; +} + + +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + unsigned long end = hva + PAGE_SIZE; + kvm_pfn_t pfn = pte_pfn(pte); + pte_t stage2_pte; + + if (!kvm->arch.pgd) + return 0; + + trace_kvm_set_spte_hva(hva); + + /* + * We've moved a page around, probably through CoW, so let's treat it + * just like a translation fault and clean the cache to the PoC. + */ + clean_dcache_guest_page(pfn, PAGE_SIZE); + stage2_pte = kvm_pfn_pte(pfn, PAGE_S2); + handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); + + return 0; +} + +static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); + if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) + return 0; + + if (pud) + return stage2_pudp_test_and_clear_young(pud); + else if (pmd) + return stage2_pmdp_test_and_clear_young(pmd); + else + return stage2_ptep_test_and_clear_young(pte); +} + +static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); + if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) + return 0; + + if (pud) + return kvm_s2pud_young(*pud); + else if (pmd) + return pmd_young(*pmd); + else + return pte_young(*pte); +} + +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +{ + if (!kvm->arch.pgd) + return 0; + trace_kvm_age_hva(start, end); + return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); +} + +int kvm_test_age_hva(struct 
kvm *kvm, unsigned long hva) +{ + if (!kvm->arch.pgd) + return 0; + trace_kvm_test_age_hva(hva); + return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE, + kvm_test_age_hva_handler, NULL); +} + +void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) +{ + mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); +} + +phys_addr_t kvm_mmu_get_httbr(void) +{ + if (__kvm_cpu_uses_extended_idmap()) + return virt_to_phys(merged_hyp_pgd); + else + return virt_to_phys(hyp_pgd); +} + +phys_addr_t kvm_get_idmap_vector(void) +{ + return hyp_idmap_vector; +} + +static int kvm_map_idmap_text(pgd_t *pgd) +{ + int err; + + /* Create the idmap in the boot page tables */ + err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), + hyp_idmap_start, hyp_idmap_end, + __phys_to_pfn(hyp_idmap_start), + PAGE_HYP_EXEC); + if (err) + kvm_err("Failed to idmap %lx-%lx\n", + hyp_idmap_start, hyp_idmap_end); + + return err; +} + +int kvm_mmu_init(void) +{ + int err; + + hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start); + hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); + hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end); + hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE); + hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init); + + /* + * We rely on the linker script to ensure at build time that the HYP + * init code does not cross a page boundary. + */ + BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); + + kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); + kvm_debug("HYP VA range: %lx:%lx\n", + kern_hyp_va(PAGE_OFFSET), + kern_hyp_va((unsigned long)high_memory - 1)); + + if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && + hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) && + hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) { + /* + * The idmap page is intersecting with the VA space, + * it is not safe to continue further. 
+ */ + kvm_err("IDMAP intersecting with HYP VA, unable to continue\n"); + err = -EINVAL; + goto out; + } + + hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); + if (!hyp_pgd) { + kvm_err("Hyp mode PGD not allocated\n"); + err = -ENOMEM; + goto out; + } + + if (__kvm_cpu_uses_extended_idmap()) { + boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + hyp_pgd_order); + if (!boot_hyp_pgd) { + kvm_err("Hyp boot PGD not allocated\n"); + err = -ENOMEM; + goto out; + } + + err = kvm_map_idmap_text(boot_hyp_pgd); + if (err) + goto out; + + merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!merged_hyp_pgd) { + kvm_err("Failed to allocate extra HYP pgd\n"); + goto out; + } + __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd, + hyp_idmap_start); + } else { + err = kvm_map_idmap_text(hyp_pgd); + if (err) + goto out; + } + + io_map_base = hyp_idmap_start; + return 0; +out: + free_hyp_pgds(); + return err; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, + enum kvm_mr_change change) +{ + /* + * At this point memslot has been committed and there is an + * allocated dirty_bitmap[], dirty pages will be be tracked while the + * memory slot is write protected. 
+ */ + if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) + kvm_mmu_wp_memory_region(kvm, mem->slot); +} + +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + const struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) +{ + hva_t hva = mem->userspace_addr; + hva_t reg_end = hva + mem->memory_size; + bool writable = !(mem->flags & KVM_MEM_READONLY); + int ret = 0; + + if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && + change != KVM_MR_FLAGS_ONLY) + return 0; + + /* + * Prevent userspace from creating a memory region outside of the IPA + * space addressable by the KVM guest IPA space. + */ + if (memslot->base_gfn + memslot->npages >= + (kvm_phys_size(kvm) >> PAGE_SHIFT)) + return -EFAULT; + + down_read(¤t->mm->mmap_sem); + /* + * A memory region could potentially cover multiple VMAs, and any holes + * between them, so iterate over all of them to find out if we can map + * any of them right now. + * + * +--------------------------------------------+ + * +---------------+----------------+ +----------------+ + * | : VMA 1 | VMA 2 | | VMA 3 : | + * +---------------+----------------+ +----------------+ + * | memory region | + * +--------------------------------------------+ + */ + do { + struct vm_area_struct *vma = find_vma(current->mm, hva); + hva_t vm_start, vm_end; + + if (!vma || vma->vm_start >= reg_end) + break; + + /* + * Take the intersection of this VMA with the memory region + */ + vm_start = max(hva, vma->vm_start); + vm_end = min(reg_end, vma->vm_end); + + if (vma->vm_flags & VM_PFNMAP) { + gpa_t gpa = mem->guest_phys_addr + + (vm_start - mem->userspace_addr); + phys_addr_t pa; + + pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT; + pa += vm_start - vma->vm_start; + + /* IO region dirty page logging not allowed */ + if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) { + ret = -EINVAL; + goto out; + } + + ret = kvm_phys_addr_ioremap(kvm, gpa, pa, + vm_end - vm_start, + writable); + if (ret) 
+ break; + } + hva = vm_end; + } while (hva < reg_end); + + if (change == KVM_MR_FLAGS_ONLY) + goto out; + + spin_lock(&kvm->mmu_lock); + if (ret) + unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size); + else + stage2_flush_memslot(kvm, memslot); + spin_unlock(&kvm->mmu_lock); +out: + up_read(¤t->mm->mmap_sem); + return ret; +} + +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ +} + +void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) +{ +} + +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ + kvm_free_stage2_pgd(kvm); +} + +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + gpa_t gpa = slot->base_gfn << PAGE_SHIFT; + phys_addr_t size = slot->npages << PAGE_SHIFT; + + spin_lock(&kvm->mmu_lock); + unmap_stage2_range(kvm, gpa, size); + spin_unlock(&kvm->mmu_lock); +} + +/* + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). + * + * Main problems: + * - S/W ops are local to a CPU (not broadcast) + * - We have line migration behind our back (speculation) + * - System caches don't support S/W at all (damn!) + * + * In the face of the above, the best we can do is to try and convert + * S/W ops to VA ops. Because the guest is not allowed to infer the + * S/W to PA mapping, it can only use S/W to nuke the whole cache, + * which is a rather good thing for us. + * + * Also, it is only used when turning caches on/off ("The expected + * usage of the cache maintenance instructions that operate by set/way + * is associated with the cache maintenance instructions associated + * with the powerdown and powerup of caches, if this is required by + * the implementation."). + * + * We use the following policy: + * + * - If we trap a S/W operation, we enable VM trapping to detect + * caches being turned on/off, and do a full clean. + * + * - We flush the caches on both caches being turned on and off. + * + * - Once the caches are enabled, we stop trapping VM ops. 
+ */ +void kvm_set_way_flush(struct kvm_vcpu *vcpu) +{ + unsigned long hcr = *vcpu_hcr(vcpu); + + /* + * If this is the first time we do a S/W operation + * (i.e. HCR_TVM not set) flush the whole memory, and set the + * VM trapping. + * + * Otherwise, rely on the VM trapping to wait for the MMU + + * Caches to be turned off. At that point, we'll be able to + * clean the caches again. + */ + if (!(hcr & HCR_TVM)) { + trace_kvm_set_way_flush(*vcpu_pc(vcpu), + vcpu_has_cache_enabled(vcpu)); + stage2_flush_vm(vcpu->kvm); + *vcpu_hcr(vcpu) = hcr | HCR_TVM; + } +} + +void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) +{ + bool now_enabled = vcpu_has_cache_enabled(vcpu); + + /* + * If switching the MMU+caches on, need to invalidate the caches. + * If switching it off, need to clean the caches. + * Clean + invalidate does the trick always. + */ + if (now_enabled != was_enabled) + stage2_flush_vm(vcpu->kvm); + + /* Caches are now on, stop trapping VM ops (until a S/W op) */ + if (now_enabled) + *vcpu_hcr(vcpu) &= ~HCR_TVM; + + trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); +} diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c new file mode 100644 index 000000000000..d45b8b9a4415 --- /dev/null +++ b/arch/arm64/kvm/perf.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Based on the x86 implementation. + * + * Copyright (C) 2012 ARM Ltd. 
+ * Author: Marc Zyngier + */ + +#include +#include + +#include + +static int kvm_is_in_guest(void) +{ + return kvm_get_running_vcpu() != NULL; +} + +static int kvm_is_user_mode(void) +{ + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_running_vcpu(); + + if (vcpu) + return !vcpu_mode_priv(vcpu); + + return 0; +} + +static unsigned long kvm_get_guest_ip(void) +{ + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_running_vcpu(); + + if (vcpu) + return *vcpu_pc(vcpu); + + return 0; +} + +static struct perf_guest_info_callbacks kvm_guest_cbs = { + .is_in_guest = kvm_is_in_guest, + .is_user_mode = kvm_is_user_mode, + .get_guest_ip = kvm_get_guest_ip, +}; + +int kvm_perf_init(void) +{ + return perf_register_guest_info_callbacks(&kvm_guest_cbs); +} + +int kvm_perf_teardown(void) +{ + return perf_unregister_guest_info_callbacks(&kvm_guest_cbs); +} diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c new file mode 100644 index 000000000000..f0d0312c0a55 --- /dev/null +++ b/arch/arm64/kvm/pmu-emul.c @@ -0,0 +1,869 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 Linaro Ltd. 
+ * Author: Shannon Zhao + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx); +static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx); +static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc); + +#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1 + +/** + * kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter + * @vcpu: The vcpu pointer + * @select_idx: The counter index + */ +static bool kvm_pmu_idx_is_64bit(struct kvm_vcpu *vcpu, u64 select_idx) +{ + return (select_idx == ARMV8_PMU_CYCLE_IDX && + __vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_LC); +} + +static struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu; + struct kvm_vcpu_arch *vcpu_arch; + + pmc -= pmc->idx; + pmu = container_of(pmc, struct kvm_pmu, pmc[0]); + vcpu_arch = container_of(pmu, struct kvm_vcpu_arch, pmu); + return container_of(vcpu_arch, struct kvm_vcpu, arch); +} + +/** + * kvm_pmu_pmc_is_chained - determine if the pmc is chained + * @pmc: The PMU counter pointer + */ +static bool kvm_pmu_pmc_is_chained(struct kvm_pmc *pmc) +{ + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + + return test_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); +} + +/** + * kvm_pmu_idx_is_high_counter - determine if select_idx is a high/low counter + * @select_idx: The counter index + */ +static bool kvm_pmu_idx_is_high_counter(u64 select_idx) +{ + return select_idx & 0x1; +} + +/** + * kvm_pmu_get_canonical_pmc - obtain the canonical pmc + * @pmc: The PMU counter pointer + * + * When a pair of PMCs are chained together we use the low counter (canonical) + * to hold the underlying perf event. 
+ */ +static struct kvm_pmc *kvm_pmu_get_canonical_pmc(struct kvm_pmc *pmc) +{ + if (kvm_pmu_pmc_is_chained(pmc) && + kvm_pmu_idx_is_high_counter(pmc->idx)) + return pmc - 1; + + return pmc; +} +static struct kvm_pmc *kvm_pmu_get_alternate_pmc(struct kvm_pmc *pmc) +{ + if (kvm_pmu_idx_is_high_counter(pmc->idx)) + return pmc - 1; + else + return pmc + 1; +} + +/** + * kvm_pmu_idx_has_chain_evtype - determine if the event type is chain + * @vcpu: The vcpu pointer + * @select_idx: The counter index + */ +static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx) +{ + u64 eventsel, reg; + + select_idx |= 0x1; + + if (select_idx == ARMV8_PMU_CYCLE_IDX) + return false; + + reg = PMEVTYPER0_EL0 + select_idx; + eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT; + + return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN; +} + +/** + * kvm_pmu_get_pair_counter_value - get PMU counter value + * @vcpu: The vcpu pointer + * @pmc: The PMU counter pointer + */ +static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu, + struct kvm_pmc *pmc) +{ + u64 counter, counter_high, reg, enabled, running; + + if (kvm_pmu_pmc_is_chained(pmc)) { + pmc = kvm_pmu_get_canonical_pmc(pmc); + reg = PMEVCNTR0_EL0 + pmc->idx; + + counter = __vcpu_sys_reg(vcpu, reg); + counter_high = __vcpu_sys_reg(vcpu, reg + 1); + + counter = lower_32_bits(counter) | (counter_high << 32); + } else { + reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX) + ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx; + counter = __vcpu_sys_reg(vcpu, reg); + } + + /* + * The real counter value is equal to the value of counter register plus + * the value perf event counts. 
+ */ + if (pmc->perf_event) + counter += perf_event_read_value(pmc->perf_event, &enabled, + &running); + + return counter; +} + +/** + * kvm_pmu_get_counter_value - get PMU counter value + * @vcpu: The vcpu pointer + * @select_idx: The counter index + */ +u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) +{ + u64 counter; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc = &pmu->pmc[select_idx]; + + counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); + + if (kvm_pmu_pmc_is_chained(pmc) && + kvm_pmu_idx_is_high_counter(select_idx)) + counter = upper_32_bits(counter); + else if (select_idx != ARMV8_PMU_CYCLE_IDX) + counter = lower_32_bits(counter); + + return counter; +} + +/** + * kvm_pmu_set_counter_value - set PMU counter value + * @vcpu: The vcpu pointer + * @select_idx: The counter index + * @val: The counter value + */ +void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) +{ + u64 reg; + + reg = (select_idx == ARMV8_PMU_CYCLE_IDX) + ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx; + __vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx); + + /* Recreate the perf event to reflect the updated sample_period */ + kvm_pmu_create_perf_event(vcpu, select_idx); +} + +/** + * kvm_pmu_release_perf_event - remove the perf event + * @pmc: The PMU counter pointer + */ +static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc) +{ + pmc = kvm_pmu_get_canonical_pmc(pmc); + if (pmc->perf_event) { + perf_event_disable(pmc->perf_event); + perf_event_release_kernel(pmc->perf_event); + pmc->perf_event = NULL; + } +} + +/** + * kvm_pmu_stop_counter - stop PMU counter + * @pmc: The PMU counter pointer + * + * If this counter has been configured to monitor some event, release it here. 
+ */ +static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc) +{ + u64 counter, reg, val; + + pmc = kvm_pmu_get_canonical_pmc(pmc); + if (!pmc->perf_event) + return; + + counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); + + if (pmc->idx == ARMV8_PMU_CYCLE_IDX) { + reg = PMCCNTR_EL0; + val = counter; + } else { + reg = PMEVCNTR0_EL0 + pmc->idx; + val = lower_32_bits(counter); + } + + __vcpu_sys_reg(vcpu, reg) = val; + + if (kvm_pmu_pmc_is_chained(pmc)) + __vcpu_sys_reg(vcpu, reg + 1) = upper_32_bits(counter); + + kvm_pmu_release_perf_event(pmc); +} + +/** + * kvm_pmu_vcpu_init - assign pmu counter idx for cpu + * @vcpu: The vcpu pointer + * + */ +void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + + for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + pmu->pmc[i].idx = i; +} + +/** + * kvm_pmu_vcpu_reset - reset pmu state for cpu + * @vcpu: The vcpu pointer + * + */ +void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) +{ + unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); + struct kvm_pmu *pmu = &vcpu->arch.pmu; + int i; + + for_each_set_bit(i, &mask, 32) + kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]); + + bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS); +} + +/** + * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu + * @vcpu: The vcpu pointer + * + */ +void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + + for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + kvm_pmu_release_perf_event(&pmu->pmc[i]); +} + +u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) +{ + u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT; + + val &= ARMV8_PMU_PMCR_N_MASK; + if (val == 0) + return BIT(ARMV8_PMU_CYCLE_IDX); + else + return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX); +} + +/** + * kvm_pmu_enable_counter_mask - enable selected PMU counters + * @vcpu: The vcpu pointer + * @val: the value guest writes to PMCNTENSET register + * + * 
Call perf_event_enable to start counting the perf event + */ +void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) +{ + int i; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc; + + if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val) + return; + + for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { + if (!(val & BIT(i))) + continue; + + pmc = &pmu->pmc[i]; + + /* A change in the enable state may affect the chain state */ + kvm_pmu_update_pmc_chained(vcpu, i); + kvm_pmu_create_perf_event(vcpu, i); + + /* At this point, pmc must be the canonical */ + if (pmc->perf_event) { + perf_event_enable(pmc->perf_event); + if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) + kvm_debug("fail to enable perf event\n"); + } + } +} + +/** + * kvm_pmu_disable_counter_mask - disable selected PMU counters + * @vcpu: The vcpu pointer + * @val: the value guest writes to PMCNTENCLR register + * + * Call perf_event_disable to stop counting the perf event + */ +void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) +{ + int i; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc; + + if (!val) + return; + + for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { + if (!(val & BIT(i))) + continue; + + pmc = &pmu->pmc[i]; + + /* A change in the enable state may affect the chain state */ + kvm_pmu_update_pmc_chained(vcpu, i); + kvm_pmu_create_perf_event(vcpu, i); + + /* At this point, pmc must be the canonical */ + if (pmc->perf_event) + perf_event_disable(pmc->perf_event); + } +} + +static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) +{ + u64 reg = 0; + + if ((__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) { + reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); + reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); + reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); + reg &= kvm_pmu_valid_counter_mask(vcpu); + } + + return reg; +} + +static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + bool overflow; + + if 
(!kvm_arm_pmu_v3_ready(vcpu)) + return; + + overflow = !!kvm_pmu_overflow_status(vcpu); + if (pmu->irq_level == overflow) + return; + + pmu->irq_level = overflow; + + if (likely(irqchip_in_kernel(vcpu->kvm))) { + int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, + pmu->irq_num, overflow, pmu); + WARN_ON(ret); + } +} + +bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_sync_regs *sregs = &vcpu->run->s.regs; + bool run_level = sregs->device_irq_level & KVM_ARM_DEV_PMU; + + if (likely(irqchip_in_kernel(vcpu->kvm))) + return false; + + return pmu->irq_level != run_level; +} + +/* + * Reflect the PMU overflow interrupt output level into the kvm_run structure + */ +void kvm_pmu_update_run(struct kvm_vcpu *vcpu) +{ + struct kvm_sync_regs *regs = &vcpu->run->s.regs; + + /* Populate the timer bitmap for user space */ + regs->device_irq_level &= ~KVM_ARM_DEV_PMU; + if (vcpu->arch.pmu.irq_level) + regs->device_irq_level |= KVM_ARM_DEV_PMU; +} + +/** + * kvm_pmu_flush_hwstate - flush pmu state to cpu + * @vcpu: The vcpu pointer + * + * Check if the PMU has overflowed while we were running in the host, and inject + * an interrupt if that was the case. + */ +void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) +{ + kvm_pmu_update_state(vcpu); +} + +/** + * kvm_pmu_sync_hwstate - sync pmu state from cpu + * @vcpu: The vcpu pointer + * + * Check if the PMU has overflowed while we were running in the guest, and + * inject an interrupt if that was the case. + */ +void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) +{ + kvm_pmu_update_state(vcpu); +} + +/** + * When the perf event overflows, set the overflow status and inform the vcpu. 
+ */ +static void kvm_pmu_perf_overflow(struct perf_event *perf_event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct kvm_pmc *pmc = perf_event->overflow_handler_context; + struct arm_pmu *cpu_pmu = to_arm_pmu(perf_event->pmu); + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + int idx = pmc->idx; + u64 period; + + cpu_pmu->pmu.stop(perf_event, PERF_EF_UPDATE); + + /* + * Reset the sample period to the architectural limit, + * i.e. the point where the counter overflows. + */ + period = -(local64_read(&perf_event->count)); + + if (!kvm_pmu_idx_is_64bit(vcpu, pmc->idx)) + period &= GENMASK(31, 0); + + local64_set(&perf_event->hw.period_left, 0); + perf_event->attr.sample_period = period; + perf_event->hw.sample_period = period; + + __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx); + + if (kvm_pmu_overflow_status(vcpu)) { + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + } + + cpu_pmu->pmu.start(perf_event, PERF_EF_RELOAD); +} + +/** + * kvm_pmu_software_increment - do software increment + * @vcpu: The vcpu pointer + * @val: the value guest writes to PMSWINC register + */ +void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + int i; + + if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) + return; + + /* Weed out disabled counters */ + val &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); + + for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) { + u64 type, reg; + + if (!(val & BIT(i))) + continue; + + /* PMSWINC only applies to ... SW_INC! 
*/ + type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i); + type &= ARMV8_PMU_EVTYPE_EVENT; + if (type != ARMV8_PMUV3_PERFCTR_SW_INCR) + continue; + + /* increment this even SW_INC counter */ + reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1; + reg = lower_32_bits(reg); + __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg; + + if (reg) /* no overflow on the low part */ + continue; + + if (kvm_pmu_pmc_is_chained(&pmu->pmc[i])) { + /* increment the high counter */ + reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) + 1; + reg = lower_32_bits(reg); + __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) = reg; + if (!reg) /* mark overflow on the high counter */ + __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i + 1); + } else { + /* mark overflow on low counter */ + __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); + } + } +} + +/** + * kvm_pmu_handle_pmcr - handle PMCR register + * @vcpu: The vcpu pointer + * @val: the value guest writes to PMCR register + */ +void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) +{ + unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); + int i; + + if (val & ARMV8_PMU_PMCR_E) { + kvm_pmu_enable_counter_mask(vcpu, + __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask); + } else { + kvm_pmu_disable_counter_mask(vcpu, mask); + } + + if (val & ARMV8_PMU_PMCR_C) + kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); + + if (val & ARMV8_PMU_PMCR_P) { + for_each_set_bit(i, &mask, 32) + kvm_pmu_set_counter_value(vcpu, i, 0); + } +} + +static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx) +{ + return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) && + (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx)); +} + +/** + * kvm_pmu_create_perf_event - create a perf event for a counter + * @vcpu: The vcpu pointer + * @select_idx: The number of selected counter + */ +static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc; + struct perf_event *event; + 
struct perf_event_attr attr; + u64 eventsel, counter, reg, data; + + /* + * For chained counters the event type and filtering attributes are + * obtained from the low/even counter. We also use this counter to + * determine if the event is enabled/disabled. + */ + pmc = kvm_pmu_get_canonical_pmc(&pmu->pmc[select_idx]); + + reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX) + ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + pmc->idx; + data = __vcpu_sys_reg(vcpu, reg); + + kvm_pmu_stop_counter(vcpu, pmc); + eventsel = data & ARMV8_PMU_EVTYPE_EVENT; + + /* Software increment event doesn't need to be backed by a perf event */ + if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR && + pmc->idx != ARMV8_PMU_CYCLE_IDX) + return; + + memset(&attr, 0, sizeof(struct perf_event_attr)); + attr.type = PERF_TYPE_RAW; + attr.size = sizeof(attr); + attr.pinned = 1; + attr.disabled = !kvm_pmu_counter_is_enabled(vcpu, pmc->idx); + attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0; + attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0; + attr.exclude_hv = 1; /* Don't count EL2 events */ + attr.exclude_host = 1; /* Don't count host events */ + attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ? + ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel; + + counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); + + if (kvm_pmu_pmc_is_chained(pmc)) { + /** + * The initial sample period (overflow count) of an event. For + * chained counters we only support overflow interrupts on the + * high counter. + */ + attr.sample_period = (-counter) & GENMASK(63, 0); + attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED; + + event = perf_event_create_kernel_counter(&attr, -1, current, + kvm_pmu_perf_overflow, + pmc + 1); + } else { + /* The initial sample period (overflow count) of an event.
*/ + if (kvm_pmu_idx_is_64bit(vcpu, pmc->idx)) + attr.sample_period = (-counter) & GENMASK(63, 0); + else + attr.sample_period = (-counter) & GENMASK(31, 0); + + event = perf_event_create_kernel_counter(&attr, -1, current, + kvm_pmu_perf_overflow, pmc); + } + + if (IS_ERR(event)) { + pr_err_once("kvm: pmu event creation failed %ld\n", + PTR_ERR(event)); + return; + } + + pmc->perf_event = event; +} + +/** + * kvm_pmu_update_pmc_chained - update chained bitmap + * @vcpu: The vcpu pointer + * @select_idx: The number of selected counter + * + * Update the chained bitmap based on the event type written in the + * typer register and the enable state of the odd register. + */ +static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc = &pmu->pmc[select_idx], *canonical_pmc; + bool new_state, old_state; + + old_state = kvm_pmu_pmc_is_chained(pmc); + new_state = kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx) && + kvm_pmu_counter_is_enabled(vcpu, pmc->idx | 0x1); + + if (old_state == new_state) + return; + + canonical_pmc = kvm_pmu_get_canonical_pmc(pmc); + kvm_pmu_stop_counter(vcpu, canonical_pmc); + if (new_state) { + /* + * During promotion from !chained to chained we must ensure + * the adjacent counter is stopped and its event destroyed + */ + kvm_pmu_stop_counter(vcpu, kvm_pmu_get_alternate_pmc(pmc)); + set_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); + return; + } + clear_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); +} + +/** + * kvm_pmu_set_counter_event_type - set selected counter to monitor some event + * @vcpu: The vcpu pointer + * @data: The data guest writes to PMXEVTYPER_EL0 + * @select_idx: The number of selected counter + * + * When OS accesses PMXEVTYPER_EL0, that means it wants to set a PMC to count an + * event with given hardware event number. Here we call perf_event API to + * emulate this action and create a kernel perf event for it. 
+ */ +void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, + u64 select_idx) +{ + u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK; + + reg = (select_idx == ARMV8_PMU_CYCLE_IDX) + ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx; + + __vcpu_sys_reg(vcpu, reg) = event_type; + + kvm_pmu_update_pmc_chained(vcpu, select_idx); + kvm_pmu_create_perf_event(vcpu, select_idx); +} + +bool kvm_arm_support_pmu_v3(void) +{ + /* + * Check if HW_PERF_EVENTS are supported by checking the number of + * hardware performance counters. This could ensure the presence of + * a physical PMU and CONFIG_PERF_EVENT is selected. + */ + return (perf_num_counters() > 0); +} + +int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.pmu.created) + return 0; + + /* + * A valid interrupt configuration for the PMU is either to have a + * properly configured interrupt number and using an in-kernel + * irqchip, or to not have an in-kernel GIC and not set an IRQ. + */ + if (irqchip_in_kernel(vcpu->kvm)) { + int irq = vcpu->arch.pmu.irq_num; + if (!kvm_arm_pmu_irq_initialized(vcpu)) + return -EINVAL; + + /* + * If we are using an in-kernel vgic, at this point we know + * the vgic will be initialized, so we can check the PMU irq + * number against the dimensions of the vgic and make sure + * it's valid. + */ + if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq)) + return -EINVAL; + } else if (kvm_arm_pmu_irq_initialized(vcpu)) { + return -EINVAL; + } + + kvm_pmu_vcpu_reset(vcpu); + vcpu->arch.pmu.ready = true; + + return 0; +} + +static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) +{ + if (!kvm_arm_support_pmu_v3()) + return -ENODEV; + + if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) + return -ENXIO; + + if (vcpu->arch.pmu.created) + return -EBUSY; + + if (irqchip_in_kernel(vcpu->kvm)) { + int ret; + + /* + * If using the PMU with an in-kernel virtual GIC + * implementation, we require the GIC to be already + * initialized when initializing the PMU. 
+ */ + if (!vgic_initialized(vcpu->kvm)) + return -ENODEV; + + if (!kvm_arm_pmu_irq_initialized(vcpu)) + return -ENXIO; + + ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num, + &vcpu->arch.pmu); + if (ret) + return ret; + } + + vcpu->arch.pmu.created = true; + return 0; +} + +/* + * For one VM the interrupt type must be same for each vcpu. + * As a PPI, the interrupt number is the same for all vcpus, + * while as an SPI it must be a separate number per vcpu. + */ +static bool pmu_irq_is_valid(struct kvm *kvm, int irq) +{ + int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_arm_pmu_irq_initialized(vcpu)) + continue; + + if (irq_is_ppi(irq)) { + if (vcpu->arch.pmu.irq_num != irq) + return false; + } else { + if (vcpu->arch.pmu.irq_num == irq) + return false; + } + } + + return true; +} + +int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VCPU_PMU_V3_IRQ: { + int __user *uaddr = (int __user *)(long)attr->addr; + int irq; + + if (!irqchip_in_kernel(vcpu->kvm)) + return -EINVAL; + + if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) + return -ENODEV; + + if (get_user(irq, uaddr)) + return -EFAULT; + + /* The PMU overflow interrupt can be a PPI or a valid SPI. 
*/ + if (!(irq_is_ppi(irq) || irq_is_spi(irq))) + return -EINVAL; + + if (!pmu_irq_is_valid(vcpu->kvm, irq)) + return -EINVAL; + + if (kvm_arm_pmu_irq_initialized(vcpu)) + return -EBUSY; + + kvm_debug("Set kvm ARM PMU irq: %d\n", irq); + vcpu->arch.pmu.irq_num = irq; + return 0; + } + case KVM_ARM_VCPU_PMU_V3_INIT: + return kvm_arm_pmu_v3_init(vcpu); + } + + return -ENXIO; +} + +int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VCPU_PMU_V3_IRQ: { + int __user *uaddr = (int __user *)(long)attr->addr; + int irq; + + if (!irqchip_in_kernel(vcpu->kvm)) + return -EINVAL; + + if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) + return -ENODEV; + + if (!kvm_arm_pmu_irq_initialized(vcpu)) + return -ENXIO; + + irq = vcpu->arch.pmu.irq_num; + return put_user(irq, uaddr); + } + } + + return -ENXIO; +} + +int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VCPU_PMU_V3_IRQ: + case KVM_ARM_VCPU_PMU_V3_INIT: + if (kvm_arm_support_pmu_v3() && + test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) + return 0; + } + + return -ENXIO; +} diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c new file mode 100644 index 000000000000..ae364716ee40 --- /dev/null +++ b/arch/arm64/kvm/psci.c @@ -0,0 +1,564 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 - ARM Ltd + * Author: Marc Zyngier + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +/* + * This is an implementation of the Power State Coordination Interface + * as described in ARM document number ARM DEN 0022A. 
+ */ + +#define AFFINITY_MASK(level) ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1) + +static unsigned long psci_affinity_mask(unsigned long affinity_level) +{ + if (affinity_level <= 3) + return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level); + + return 0; +} + +static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu) +{ + /* + * NOTE: For simplicity, we make VCPU suspend emulation to be + * same-as WFI (Wait-for-interrupt) emulation. + * + * This means for KVM the wakeup events are interrupts and + * this is consistent with intended use of StateID as described + * in section 5.4.1 of PSCI v0.2 specification (ARM DEN 0022A). + * + * Further, we also treat power-down request to be same as + * stand-by request as-per section 5.4.2 clause 3 of PSCI v0.2 + * specification (ARM DEN 0022A). This means all suspend states + * for KVM will preserve the register state. + */ + kvm_vcpu_block(vcpu); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); + + return PSCI_RET_SUCCESS; +} + +static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) +{ + vcpu->arch.power_off = true; + kvm_make_request(KVM_REQ_SLEEP, vcpu); + kvm_vcpu_kick(vcpu); +} + +static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) +{ + struct vcpu_reset_state *reset_state; + struct kvm *kvm = source_vcpu->kvm; + struct kvm_vcpu *vcpu = NULL; + unsigned long cpu_id; + + cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK; + if (vcpu_mode_is_32bit(source_vcpu)) + cpu_id &= ~((u32) 0); + + vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id); + + /* + * Make sure the caller requested a valid CPU and that the CPU is + * turned off. 
+ */ + if (!vcpu) + return PSCI_RET_INVALID_PARAMS; + if (!vcpu->arch.power_off) { + if (kvm_psci_version(source_vcpu, kvm) != KVM_ARM_PSCI_0_1) + return PSCI_RET_ALREADY_ON; + else + return PSCI_RET_INVALID_PARAMS; + } + + reset_state = &vcpu->arch.reset_state; + + reset_state->pc = smccc_get_arg2(source_vcpu); + + /* Propagate caller endianness */ + reset_state->be = kvm_vcpu_is_be(source_vcpu); + + /* + * NOTE: We always update r0 (or x0) because for PSCI v0.1 + * the general purpose registers are undefined upon CPU_ON. + */ + reset_state->r0 = smccc_get_arg3(source_vcpu); + + WRITE_ONCE(reset_state->reset, true); + kvm_make_request(KVM_REQ_VCPU_RESET, vcpu); + + /* + * Make sure the reset request is observed if the change to + * power_off is observed. + */ + smp_wmb(); + + vcpu->arch.power_off = false; + kvm_vcpu_wake_up(vcpu); + + return PSCI_RET_SUCCESS; +} + +static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) +{ + int i, matching_cpus = 0; + unsigned long mpidr; + unsigned long target_affinity; + unsigned long target_affinity_mask; + unsigned long lowest_affinity_level; + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *tmp; + + target_affinity = smccc_get_arg1(vcpu); + lowest_affinity_level = smccc_get_arg2(vcpu); + + /* Determine target affinity mask */ + target_affinity_mask = psci_affinity_mask(lowest_affinity_level); + if (!target_affinity_mask) + return PSCI_RET_INVALID_PARAMS; + + /* Ignore other bits of target affinity */ + target_affinity &= target_affinity_mask; + + /* + * If one or more VCPU matching target affinity are running + * then ON else OFF + */ + kvm_for_each_vcpu(i, tmp, kvm) { + mpidr = kvm_vcpu_get_mpidr_aff(tmp); + if ((mpidr & target_affinity_mask) == target_affinity) { + matching_cpus++; + if (!tmp->arch.power_off) + return PSCI_0_2_AFFINITY_LEVEL_ON; + } + } + + if (!matching_cpus) + return PSCI_RET_INVALID_PARAMS; + + return PSCI_0_2_AFFINITY_LEVEL_OFF; +} + +static void kvm_prepare_system_event(struct
kvm_vcpu *vcpu, u32 type) +{ + int i; + struct kvm_vcpu *tmp; + + /* + * The KVM ABI specifies that a system event exit may call KVM_RUN + * again and may perform shutdown/reboot at a later time than when the + * actual request is made. Since we are implementing PSCI and a + * caller of PSCI reboot and shutdown expects that the system shuts + * down or reboots immediately, let's make sure that VCPUs are not run + * after this call is handled and before the VCPUs have been + * re-initialized. + */ + kvm_for_each_vcpu(i, tmp, vcpu->kvm) + tmp->arch.power_off = true; + kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP); + + memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); + vcpu->run->system_event.type = type; + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; +} + +static void kvm_psci_system_off(struct kvm_vcpu *vcpu) +{ + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN); +} + +static void kvm_psci_system_reset(struct kvm_vcpu *vcpu) +{ + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET); +} + +static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu) +{ + int i; + + /* + * Zero the input registers' upper 32 bits. They will be fully + * zeroed on exit, so we're fine changing them in place.
+ */ + for (i = 1; i < 4; i++) + vcpu_set_reg(vcpu, i, lower_32_bits(vcpu_get_reg(vcpu, i))); +} + +static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 fn) +{ + switch(fn) { + case PSCI_0_2_FN64_CPU_SUSPEND: + case PSCI_0_2_FN64_CPU_ON: + case PSCI_0_2_FN64_AFFINITY_INFO: + /* Disallow these functions for 32bit guests */ + if (vcpu_mode_is_32bit(vcpu)) + return PSCI_RET_NOT_SUPPORTED; + break; + } + + return 0; +} + +static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + u32 psci_fn = smccc_get_function(vcpu); + unsigned long val; + int ret = 1; + + val = kvm_psci_check_allowed_function(vcpu, psci_fn); + if (val) + goto out; + + switch (psci_fn) { + case PSCI_0_2_FN_PSCI_VERSION: + /* + * Bits[31:16] = Major Version = 0 + * Bits[15:0] = Minor Version = 2 + */ + val = KVM_ARM_PSCI_0_2; + break; + case PSCI_0_2_FN_CPU_SUSPEND: + case PSCI_0_2_FN64_CPU_SUSPEND: + val = kvm_psci_vcpu_suspend(vcpu); + break; + case PSCI_0_2_FN_CPU_OFF: + kvm_psci_vcpu_off(vcpu); + val = PSCI_RET_SUCCESS; + break; + case PSCI_0_2_FN_CPU_ON: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_0_2_FN64_CPU_ON: + mutex_lock(&kvm->lock); + val = kvm_psci_vcpu_on(vcpu); + mutex_unlock(&kvm->lock); + break; + case PSCI_0_2_FN_AFFINITY_INFO: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_0_2_FN64_AFFINITY_INFO: + val = kvm_psci_vcpu_affinity_info(vcpu); + break; + case PSCI_0_2_FN_MIGRATE_INFO_TYPE: + /* + * Trusted OS is MP hence does not require migration + * or + * Trusted OS is not present + */ + val = PSCI_0_2_TOS_MP; + break; + case PSCI_0_2_FN_SYSTEM_OFF: + kvm_psci_system_off(vcpu); + /* + * We shouldn't be going back to guest VCPU after + * receiving SYSTEM_OFF request. + * + * If user space accidentally/deliberately resumes + * guest VCPU after SYSTEM_OFF request then guest + * VCPU should see internal failure from PSCI return + * value.
To achieve this, we preload r0 (or x0) with + * PSCI return value INTERNAL_FAILURE. + */ + val = PSCI_RET_INTERNAL_FAILURE; + ret = 0; + break; + case PSCI_0_2_FN_SYSTEM_RESET: + kvm_psci_system_reset(vcpu); + /* + * Same reason as SYSTEM_OFF for preloading r0 (or x0) + * with PSCI return value INTERNAL_FAILURE. + */ + val = PSCI_RET_INTERNAL_FAILURE; + ret = 0; + break; + default: + val = PSCI_RET_NOT_SUPPORTED; + break; + } + +out: + smccc_set_retval(vcpu, val, 0, 0, 0); + return ret; +} + +static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu) +{ + u32 psci_fn = smccc_get_function(vcpu); + u32 feature; + unsigned long val; + int ret = 1; + + switch(psci_fn) { + case PSCI_0_2_FN_PSCI_VERSION: + val = KVM_ARM_PSCI_1_0; + break; + case PSCI_1_0_FN_PSCI_FEATURES: + feature = smccc_get_arg1(vcpu); + val = kvm_psci_check_allowed_function(vcpu, feature); + if (val) + break; + + switch(feature) { + case PSCI_0_2_FN_PSCI_VERSION: + case PSCI_0_2_FN_CPU_SUSPEND: + case PSCI_0_2_FN64_CPU_SUSPEND: + case PSCI_0_2_FN_CPU_OFF: + case PSCI_0_2_FN_CPU_ON: + case PSCI_0_2_FN64_CPU_ON: + case PSCI_0_2_FN_AFFINITY_INFO: + case PSCI_0_2_FN64_AFFINITY_INFO: + case PSCI_0_2_FN_MIGRATE_INFO_TYPE: + case PSCI_0_2_FN_SYSTEM_OFF: + case PSCI_0_2_FN_SYSTEM_RESET: + case PSCI_1_0_FN_PSCI_FEATURES: + case ARM_SMCCC_VERSION_FUNC_ID: + val = 0; + break; + default: + val = PSCI_RET_NOT_SUPPORTED; + break; + } + break; + default: + return kvm_psci_0_2_call(vcpu); + } + + smccc_set_retval(vcpu, val, 0, 0, 0); + return ret; +} + +static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + u32 psci_fn = smccc_get_function(vcpu); + unsigned long val; + + switch (psci_fn) { + case KVM_PSCI_FN_CPU_OFF: + kvm_psci_vcpu_off(vcpu); + val = PSCI_RET_SUCCESS; + break; + case KVM_PSCI_FN_CPU_ON: + mutex_lock(&kvm->lock); + val = kvm_psci_vcpu_on(vcpu); + mutex_unlock(&kvm->lock); + break; + default: + val = PSCI_RET_NOT_SUPPORTED; + break; + } + + smccc_set_retval(vcpu, val, 0, 0, 
0); + return 1; +} + +/** + * kvm_psci_call - handle PSCI call if r0 value is in range + * @vcpu: Pointer to the VCPU struct + * + * Handle PSCI calls from guests through traps from HVC instructions. + * The calling convention is similar to SMC calls to the secure world + * where the function number is placed in r0. + * + * This function returns: > 0 (success), 0 (success but exit to user + * space), and < 0 (errors) + * + * Errors: + * -EINVAL: Unrecognized PSCI function + */ +int kvm_psci_call(struct kvm_vcpu *vcpu) +{ + switch (kvm_psci_version(vcpu, vcpu->kvm)) { + case KVM_ARM_PSCI_1_0: + return kvm_psci_1_0_call(vcpu); + case KVM_ARM_PSCI_0_2: + return kvm_psci_0_2_call(vcpu); + case KVM_ARM_PSCI_0_1: + return kvm_psci_0_1_call(vcpu); + default: + return -EINVAL; + }; +} + +int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu) +{ + return 3; /* PSCI version and two workaround registers */ +} + +int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) +{ + if (put_user(KVM_REG_ARM_PSCI_VERSION, uindices++)) + return -EFAULT; + + if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1, uindices++)) + return -EFAULT; + + if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2, uindices++)) + return -EFAULT; + + return 0; +} + +#define KVM_REG_FEATURE_LEVEL_WIDTH 4 +#define KVM_REG_FEATURE_LEVEL_MASK (BIT(KVM_REG_FEATURE_LEVEL_WIDTH) - 1) + +/* + * Convert the workaround level into an easy-to-compare number, where higher + * values mean better protection. 
+ */ +static int get_kernel_wa_level(u64 regid) +{ + switch (regid) { + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + switch (kvm_arm_harden_branch_predictor()) { + case KVM_BP_HARDEN_UNKNOWN: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; + case KVM_BP_HARDEN_WA_NEEDED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL; + case KVM_BP_HARDEN_NOT_REQUIRED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED; + } + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + switch (kvm_arm_have_ssbd()) { + case KVM_SSBD_FORCE_DISABLE: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; + case KVM_SSBD_KERNEL: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL; + case KVM_SSBD_FORCE_ENABLE: + case KVM_SSBD_MITIGATED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED; + case KVM_SSBD_UNKNOWN: + default: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN; + } + } + + return -EINVAL; +} + +int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + void __user *uaddr = (void __user *)(long)reg->addr; + u64 val; + + switch (reg->id) { + case KVM_REG_ARM_PSCI_VERSION: + val = kvm_psci_version(vcpu, vcpu->kvm); + break; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; + break; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; + + if (val == KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL && + kvm_arm_get_vcpu_workaround_2_flag(vcpu)) + val |= KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED; + break; + default: + return -ENOENT; + } + + if (copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + void __user *uaddr = (void __user *)(long)reg->addr; + u64 val; + int wa_level; + + if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + switch 
(reg->id) { + case KVM_REG_ARM_PSCI_VERSION: + { + bool wants_02; + + wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features); + + switch (val) { + case KVM_ARM_PSCI_0_1: + if (wants_02) + return -EINVAL; + vcpu->kvm->arch.psci_version = val; + return 0; + case KVM_ARM_PSCI_0_2: + case KVM_ARM_PSCI_1_0: + if (!wants_02) + return -EINVAL; + vcpu->kvm->arch.psci_version = val; + return 0; + } + break; + } + + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + if (val & ~KVM_REG_FEATURE_LEVEL_MASK) + return -EINVAL; + + if (get_kernel_wa_level(reg->id) < val) + return -EINVAL; + + return 0; + + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + if (val & ~(KVM_REG_FEATURE_LEVEL_MASK | + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED)) + return -EINVAL; + + wa_level = val & KVM_REG_FEATURE_LEVEL_MASK; + + if (get_kernel_wa_level(reg->id) < wa_level) + return -EINVAL; + + /* The enabled bit must not be set unless the level is AVAIL. */ + if (wa_level != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL && + wa_level != val) + return -EINVAL; + + /* Are we finished or do we need to check the enable bit ? */ + if (kvm_arm_have_ssbd() != KVM_SSBD_KERNEL) + return 0; + + /* + * If this kernel supports the workaround to be switched on + * or off, make sure it matches the requested setting. + */ + switch (wa_level) { + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: + kvm_arm_set_vcpu_workaround_2_flag(vcpu, + val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED); + break; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: + kvm_arm_set_vcpu_workaround_2_flag(vcpu, true); + break; + } + + return 0; + default: + return -ENOENT; + } + + return -EINVAL; +} diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c new file mode 100644 index 000000000000..1e0f4c284888 --- /dev/null +++ b/arch/arm64/kvm/pvtime.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2019 Arm Ltd. 
+ +#include +#include + +#include +#include + +#include + +void kvm_update_stolen_time(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + u64 steal; + __le64 steal_le; + u64 offset; + int idx; + u64 base = vcpu->arch.steal.base; + + if (base == GPA_INVALID) + return; + + /* Let's do the local bookkeeping */ + steal = vcpu->arch.steal.steal; + steal += current->sched_info.run_delay - vcpu->arch.steal.last_steal; + vcpu->arch.steal.last_steal = current->sched_info.run_delay; + vcpu->arch.steal.steal = steal; + + steal_le = cpu_to_le64(steal); + idx = srcu_read_lock(&kvm->srcu); + offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time); + kvm_put_guest(kvm, base + offset, steal_le, u64); + srcu_read_unlock(&kvm->srcu, idx); +} + +long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu) +{ + u32 feature = smccc_get_arg1(vcpu); + long val = SMCCC_RET_NOT_SUPPORTED; + + switch (feature) { + case ARM_SMCCC_HV_PV_TIME_FEATURES: + case ARM_SMCCC_HV_PV_TIME_ST: + val = SMCCC_RET_SUCCESS; + break; + } + + return val; +} + +gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) +{ + struct pvclock_vcpu_stolen_time init_values = {}; + struct kvm *kvm = vcpu->kvm; + u64 base = vcpu->arch.steal.base; + int idx; + + if (base == GPA_INVALID) + return base; + + /* + * Start counting stolen time from the time the guest requests + * the feature enabled. 
+ */ + vcpu->arch.steal.steal = 0; + vcpu->arch.steal.last_steal = current->sched_info.run_delay; + + idx = srcu_read_lock(&kvm->srcu); + kvm_write_guest(kvm, base, &init_values, sizeof(init_values)); + srcu_read_unlock(&kvm->srcu, idx); + + return base; +} + +int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *user = (u64 __user *)attr->addr; + struct kvm *kvm = vcpu->kvm; + u64 ipa; + int ret = 0; + int idx; + + if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA) + return -ENXIO; + + if (get_user(ipa, user)) + return -EFAULT; + if (!IS_ALIGNED(ipa, 64)) + return -EINVAL; + if (vcpu->arch.steal.base != GPA_INVALID) + return -EEXIST; + + /* Check the address is in a valid memslot */ + idx = srcu_read_lock(&kvm->srcu); + if (kvm_is_error_hva(gfn_to_hva(kvm, ipa >> PAGE_SHIFT))) + ret = -EINVAL; + srcu_read_unlock(&kvm->srcu, idx); + + if (!ret) + vcpu->arch.steal.base = ipa; + + return ret; +} + +int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *user = (u64 __user *)attr->addr; + u64 ipa; + + if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA) + return -ENXIO; + + ipa = vcpu->arch.steal.base; + + if (put_user(ipa, user)) + return -EFAULT; + return 0; +} + +int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VCPU_PVTIME_IPA: + return 0; + } + return -ENXIO; +} diff --git a/arch/arm64/kvm/trace.h b/arch/arm64/kvm/trace.h index eab91ad0effb..86f9ea47be29 100644 --- a/arch/arm64/kvm/trace.h +++ b/arch/arm64/kvm/trace.h @@ -1,216 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#if !defined(_TRACE_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#ifndef _TRACE_ARM64_KVM_H #define _TRACE_ARM64_KVM_H -#include -#include "sys_regs.h" +#include "trace_arm.h" +#include "trace_handle_exit.h" -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kvm - -TRACE_EVENT(kvm_wfx_arm64, - TP_PROTO(unsigned long vcpu_pc, bool is_wfe), - 
TP_ARGS(vcpu_pc, is_wfe), - - TP_STRUCT__entry( - __field(unsigned long, vcpu_pc) - __field(bool, is_wfe) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - __entry->is_wfe = is_wfe; - ), - - TP_printk("guest executed wf%c at: 0x%08lx", - __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc) -); - -TRACE_EVENT(kvm_hvc_arm64, - TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm), - TP_ARGS(vcpu_pc, r0, imm), - - TP_STRUCT__entry( - __field(unsigned long, vcpu_pc) - __field(unsigned long, r0) - __field(unsigned long, imm) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - __entry->r0 = r0; - __entry->imm = imm; - ), - - TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)", - __entry->vcpu_pc, __entry->r0, __entry->imm) -); - -TRACE_EVENT(kvm_arm_setup_debug, - TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), - TP_ARGS(vcpu, guest_debug), - - TP_STRUCT__entry( - __field(struct kvm_vcpu *, vcpu) - __field(__u32, guest_debug) - ), - - TP_fast_assign( - __entry->vcpu = vcpu; - __entry->guest_debug = guest_debug; - ), - - TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug) -); - -TRACE_EVENT(kvm_arm_clear_debug, - TP_PROTO(__u32 guest_debug), - TP_ARGS(guest_debug), - - TP_STRUCT__entry( - __field(__u32, guest_debug) - ), - - TP_fast_assign( - __entry->guest_debug = guest_debug; - ), - - TP_printk("flags: 0x%08x", __entry->guest_debug) -); - -TRACE_EVENT(kvm_arm_set_dreg32, - TP_PROTO(const char *name, __u32 value), - TP_ARGS(name, value), - - TP_STRUCT__entry( - __field(const char *, name) - __field(__u32, value) - ), - - TP_fast_assign( - __entry->name = name; - __entry->value = value; - ), - - TP_printk("%s: 0x%08x", __entry->name, __entry->value) -); - -TRACE_DEFINE_SIZEOF(__u64); - -TRACE_EVENT(kvm_arm_set_regset, - TP_PROTO(const char *type, int len, __u64 *control, __u64 *value), - TP_ARGS(type, len, control, value), - TP_STRUCT__entry( - __field(const char *, name) - __field(int, len) - __array(u64, ctrls, 16) - 
__array(u64, values, 16) - ), - TP_fast_assign( - __entry->name = type; - __entry->len = len; - memcpy(__entry->ctrls, control, len << 3); - memcpy(__entry->values, value, len << 3); - ), - TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name, - __print_array(__entry->ctrls, __entry->len, sizeof(__u64)), - __print_array(__entry->values, __entry->len, sizeof(__u64))) -); - -TRACE_EVENT(trap_reg, - TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value), - TP_ARGS(fn, reg, is_write, write_value), - - TP_STRUCT__entry( - __field(const char *, fn) - __field(int, reg) - __field(bool, is_write) - __field(u64, write_value) - ), - - TP_fast_assign( - __entry->fn = fn; - __entry->reg = reg; - __entry->is_write = is_write; - __entry->write_value = write_value; - ), - - TP_printk("%s %s reg %d (0x%08llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value) -); - -TRACE_EVENT(kvm_handle_sys_reg, - TP_PROTO(unsigned long hsr), - TP_ARGS(hsr), - - TP_STRUCT__entry( - __field(unsigned long, hsr) - ), - - TP_fast_assign( - __entry->hsr = hsr; - ), - - TP_printk("HSR 0x%08lx", __entry->hsr) -); - -TRACE_EVENT(kvm_sys_access, - TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg), - TP_ARGS(vcpu_pc, params, reg), - - TP_STRUCT__entry( - __field(unsigned long, vcpu_pc) - __field(bool, is_write) - __field(const char *, name) - __field(u8, Op0) - __field(u8, Op1) - __field(u8, CRn) - __field(u8, CRm) - __field(u8, Op2) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - __entry->is_write = params->is_write; - __entry->name = reg->name; - __entry->Op0 = reg->Op0; - __entry->Op0 = reg->Op0; - __entry->Op1 = reg->Op1; - __entry->CRn = reg->CRn; - __entry->CRm = reg->CRm; - __entry->Op2 = reg->Op2; - ), - - TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s", - __entry->vcpu_pc, __entry->name ?: "UNKN", - __entry->Op0, __entry->Op1, __entry->CRn, - __entry->CRm, __entry->Op2, - 
__entry->is_write ? "write" : "read") -); - -TRACE_EVENT(kvm_set_guest_debug, - TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), - TP_ARGS(vcpu, guest_debug), - - TP_STRUCT__entry( - __field(struct kvm_vcpu *, vcpu) - __field(__u32, guest_debug) - ), - - TP_fast_assign( - __entry->vcpu = vcpu; - __entry->guest_debug = guest_debug; - ), - - TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug) -); - - -#endif /* _TRACE_ARM64_KVM_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace - -/* This part must be outside protection */ -#include +#endif /* _TRACE_ARM64_KVM_H */ diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h new file mode 100644 index 000000000000..4c71270cc097 --- /dev/null +++ b/arch/arm64/kvm/trace_arm.h @@ -0,0 +1,378 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(_TRACE_ARM_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ARM_ARM64_KVM_H + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +/* + * Tracepoints for entry/exit to guest + */ +TRACE_EVENT(kvm_entry, + TP_PROTO(unsigned long vcpu_pc), + TP_ARGS(vcpu_pc), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + ), + + TP_printk("PC: 0x%08lx", __entry->vcpu_pc) +); + +TRACE_EVENT(kvm_exit, + TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc), + TP_ARGS(ret, esr_ec, vcpu_pc), + + TP_STRUCT__entry( + __field( int, ret ) + __field( unsigned int, esr_ec ) + __field( unsigned long, vcpu_pc ) + ), + + TP_fast_assign( + __entry->ret = ARM_EXCEPTION_CODE(ret); + __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? 
esr_ec : 0; + __entry->vcpu_pc = vcpu_pc; + ), + + TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx", + __print_symbolic(__entry->ret, kvm_arm_exception_type), + __entry->esr_ec, + __print_symbolic(__entry->esr_ec, kvm_arm_exception_class), + __entry->vcpu_pc) +); + +TRACE_EVENT(kvm_guest_fault, + TP_PROTO(unsigned long vcpu_pc, unsigned long hsr, + unsigned long hxfar, + unsigned long long ipa), + TP_ARGS(vcpu_pc, hsr, hxfar, ipa), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( unsigned long, hsr ) + __field( unsigned long, hxfar ) + __field( unsigned long long, ipa ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->hsr = hsr; + __entry->hxfar = hxfar; + __entry->ipa = ipa; + ), + + TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx", + __entry->ipa, __entry->hsr, + __entry->hxfar, __entry->vcpu_pc) +); + +TRACE_EVENT(kvm_access_fault, + TP_PROTO(unsigned long ipa), + TP_ARGS(ipa), + + TP_STRUCT__entry( + __field( unsigned long, ipa ) + ), + + TP_fast_assign( + __entry->ipa = ipa; + ), + + TP_printk("IPA: %lx", __entry->ipa) +); + +TRACE_EVENT(kvm_irq_line, + TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level), + TP_ARGS(type, vcpu_idx, irq_num, level), + + TP_STRUCT__entry( + __field( unsigned int, type ) + __field( int, vcpu_idx ) + __field( int, irq_num ) + __field( int, level ) + ), + + TP_fast_assign( + __entry->type = type; + __entry->vcpu_idx = vcpu_idx; + __entry->irq_num = irq_num; + __entry->level = level; + ), + + TP_printk("Inject %s interrupt (%d), vcpu->idx: %d, num: %d, level: %d", + (__entry->type == KVM_ARM_IRQ_TYPE_CPU) ? "CPU" : + (__entry->type == KVM_ARM_IRQ_TYPE_PPI) ? "VGIC PPI" : + (__entry->type == KVM_ARM_IRQ_TYPE_SPI) ? 
"VGIC SPI" : "UNKNOWN", + __entry->type, __entry->vcpu_idx, __entry->irq_num, __entry->level) +); + +TRACE_EVENT(kvm_mmio_emulate, + TP_PROTO(unsigned long vcpu_pc, unsigned long instr, + unsigned long cpsr), + TP_ARGS(vcpu_pc, instr, cpsr), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( unsigned long, instr ) + __field( unsigned long, cpsr ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->instr = instr; + __entry->cpsr = cpsr; + ), + + TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)", + __entry->vcpu_pc, __entry->instr, __entry->cpsr) +); + +TRACE_EVENT(kvm_unmap_hva_range, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end), + + TP_STRUCT__entry( + __field( unsigned long, start ) + __field( unsigned long, end ) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + ), + + TP_printk("mmu notifier unmap range: %#08lx -- %#08lx", + __entry->start, __entry->end) +); + +TRACE_EVENT(kvm_set_spte_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva) +); + +TRACE_EVENT(kvm_age_hva, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end), + + TP_STRUCT__entry( + __field( unsigned long, start ) + __field( unsigned long, end ) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + ), + + TP_printk("mmu notifier age hva: %#08lx -- %#08lx", + __entry->start, __entry->end) +); + +TRACE_EVENT(kvm_test_age_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("mmu notifier test age hva: %#08lx", __entry->hva) +); + +TRACE_EVENT(kvm_set_way_flush, + TP_PROTO(unsigned long vcpu_pc, bool cache), + TP_ARGS(vcpu_pc, cache), + + TP_STRUCT__entry( + __field( unsigned 
long, vcpu_pc ) + __field( bool, cache ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->cache = cache; + ), + + TP_printk("S/W flush at 0x%016lx (cache %s)", + __entry->vcpu_pc, __entry->cache ? "on" : "off") +); + +TRACE_EVENT(kvm_toggle_cache, + TP_PROTO(unsigned long vcpu_pc, bool was, bool now), + TP_ARGS(vcpu_pc, was, now), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( bool, was ) + __field( bool, now ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->was = was; + __entry->now = now; + ), + + TP_printk("VM op at 0x%016lx (cache was %s, now %s)", + __entry->vcpu_pc, __entry->was ? "on" : "off", + __entry->now ? "on" : "off") +); + +/* + * Tracepoints for arch_timer + */ +TRACE_EVENT(kvm_timer_update_irq, + TP_PROTO(unsigned long vcpu_id, __u32 irq, int level), + TP_ARGS(vcpu_id, irq, level), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_id ) + __field( __u32, irq ) + __field( int, level ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->irq = irq; + __entry->level = level; + ), + + TP_printk("VCPU: %ld, IRQ %d, level %d", + __entry->vcpu_id, __entry->irq, __entry->level) +); + +TRACE_EVENT(kvm_get_timer_map, + TP_PROTO(unsigned long vcpu_id, struct timer_map *map), + TP_ARGS(vcpu_id, map), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_id ) + __field( int, direct_vtimer ) + __field( int, direct_ptimer ) + __field( int, emul_ptimer ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->direct_vtimer = arch_timer_ctx_index(map->direct_vtimer); + __entry->direct_ptimer = + (map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1; + __entry->emul_ptimer = + (map->emul_ptimer) ? 
arch_timer_ctx_index(map->emul_ptimer) : -1; + ), + + TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d", + __entry->vcpu_id, + __entry->direct_vtimer, + __entry->direct_ptimer, + __entry->emul_ptimer) +); + +TRACE_EVENT(kvm_timer_save_state, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( unsigned long, ctl ) + __field( unsigned long long, cval ) + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->ctl = ctx->cnt_ctl; + __entry->cval = ctx->cnt_cval; + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk(" CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d", + __entry->ctl, + __entry->cval, + __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_restore_state, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( unsigned long, ctl ) + __field( unsigned long long, cval ) + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->ctl = ctx->cnt_ctl; + __entry->cval = ctx->cnt_cval; + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk("CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d", + __entry->ctl, + __entry->cval, + __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_hrtimer_expire, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk("arch_timer_ctx_index: %d", __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_emulate, + TP_PROTO(struct arch_timer_context *ctx, bool should_fire), + TP_ARGS(ctx, should_fire), + + TP_STRUCT__entry( + __field( int, timer_idx ) + __field( bool, should_fire ) + ), + + TP_fast_assign( + __entry->timer_idx = arch_timer_ctx_index(ctx); + __entry->should_fire = should_fire; + ), + + TP_printk("arch_timer_ctx_index: %d (should_fire: %d)", + __entry->timer_idx, __entry->should_fire) +); + +#endif /* _TRACE_ARM_ARM64_KVM_H */ + +#undef TRACE_INCLUDE_PATH +#define 
TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace_arm + +/* This part must be outside protection */ +#include diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h new file mode 100644 index 000000000000..2c56d1e0f5bd --- /dev/null +++ b/arch/arm64/kvm/trace_handle_exit.h @@ -0,0 +1,215 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(_TRACE_HANDLE_EXIT_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HANDLE_EXIT_ARM64_KVM_H + +#include +#include "sys_regs.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +TRACE_EVENT(kvm_wfx_arm64, + TP_PROTO(unsigned long vcpu_pc, bool is_wfe), + TP_ARGS(vcpu_pc, is_wfe), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_pc) + __field(bool, is_wfe) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->is_wfe = is_wfe; + ), + + TP_printk("guest executed wf%c at: 0x%08lx", + __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc) +); + +TRACE_EVENT(kvm_hvc_arm64, + TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm), + TP_ARGS(vcpu_pc, r0, imm), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_pc) + __field(unsigned long, r0) + __field(unsigned long, imm) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->r0 = r0; + __entry->imm = imm; + ), + + TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)", + __entry->vcpu_pc, __entry->r0, __entry->imm) +); + +TRACE_EVENT(kvm_arm_setup_debug, + TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), + TP_ARGS(vcpu, guest_debug), + + TP_STRUCT__entry( + __field(struct kvm_vcpu *, vcpu) + __field(__u32, guest_debug) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->guest_debug = guest_debug; + ), + + TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug) +); + +TRACE_EVENT(kvm_arm_clear_debug, + TP_PROTO(__u32 guest_debug), + TP_ARGS(guest_debug), + + TP_STRUCT__entry( + __field(__u32, guest_debug) + ), + + TP_fast_assign( + 
__entry->guest_debug = guest_debug; + ), + + TP_printk("flags: 0x%08x", __entry->guest_debug) +); + +TRACE_EVENT(kvm_arm_set_dreg32, + TP_PROTO(const char *name, __u32 value), + TP_ARGS(name, value), + + TP_STRUCT__entry( + __field(const char *, name) + __field(__u32, value) + ), + + TP_fast_assign( + __entry->name = name; + __entry->value = value; + ), + + TP_printk("%s: 0x%08x", __entry->name, __entry->value) +); + +TRACE_DEFINE_SIZEOF(__u64); + +TRACE_EVENT(kvm_arm_set_regset, + TP_PROTO(const char *type, int len, __u64 *control, __u64 *value), + TP_ARGS(type, len, control, value), + TP_STRUCT__entry( + __field(const char *, name) + __field(int, len) + __array(u64, ctrls, 16) + __array(u64, values, 16) + ), + TP_fast_assign( + __entry->name = type; + __entry->len = len; + memcpy(__entry->ctrls, control, len << 3); + memcpy(__entry->values, value, len << 3); + ), + TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name, + __print_array(__entry->ctrls, __entry->len, sizeof(__u64)), + __print_array(__entry->values, __entry->len, sizeof(__u64))) +); + +TRACE_EVENT(trap_reg, + TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value), + TP_ARGS(fn, reg, is_write, write_value), + + TP_STRUCT__entry( + __field(const char *, fn) + __field(int, reg) + __field(bool, is_write) + __field(u64, write_value) + ), + + TP_fast_assign( + __entry->fn = fn; + __entry->reg = reg; + __entry->is_write = is_write; + __entry->write_value = write_value; + ), + + TP_printk("%s %s reg %d (0x%08llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value) +); + +TRACE_EVENT(kvm_handle_sys_reg, + TP_PROTO(unsigned long hsr), + TP_ARGS(hsr), + + TP_STRUCT__entry( + __field(unsigned long, hsr) + ), + + TP_fast_assign( + __entry->hsr = hsr; + ), + + TP_printk("HSR 0x%08lx", __entry->hsr) +); + +TRACE_EVENT(kvm_sys_access, + TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg), + TP_ARGS(vcpu_pc, 
params, reg), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_pc) + __field(bool, is_write) + __field(const char *, name) + __field(u8, Op0) + __field(u8, Op1) + __field(u8, CRn) + __field(u8, CRm) + __field(u8, Op2) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->is_write = params->is_write; + __entry->name = reg->name; + __entry->Op0 = reg->Op0; + __entry->Op0 = reg->Op0; + __entry->Op1 = reg->Op1; + __entry->CRn = reg->CRn; + __entry->CRm = reg->CRm; + __entry->Op2 = reg->Op2; + ), + + TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s", + __entry->vcpu_pc, __entry->name ?: "UNKN", + __entry->Op0, __entry->Op1, __entry->CRn, + __entry->CRm, __entry->Op2, + __entry->is_write ? "write" : "read") +); + +TRACE_EVENT(kvm_set_guest_debug, + TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), + TP_ARGS(vcpu, guest_debug), + + TP_STRUCT__entry( + __field(struct kvm_vcpu *, vcpu) + __field(__u32, guest_debug) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->guest_debug = guest_debug; + ), + + TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug) +); + +#endif /* _TRACE_HANDLE_EXIT_ARM64_KVM_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace_handle_exit + +/* This part must be outside protection */ +#include diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c index e7d1ea92095d..2f92bdcb1188 100644 --- a/arch/arm64/kvm/vgic-sys-reg-v3.c +++ b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -7,7 +7,7 @@ #include #include #include -#include "vgic.h" +#include "vgic/vgic.h" #include "sys_regs.h" static bool access_gic_ctlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, diff --git a/arch/arm64/kvm/vgic/trace.h b/arch/arm64/kvm/vgic/trace.h new file mode 100644 index 000000000000..83c64401a7fc --- /dev/null +++ b/arch/arm64/kvm/vgic/trace.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(_TRACE_VGIC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VGIC_H + +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +TRACE_EVENT(vgic_update_irq_pending, + TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level), + TP_ARGS(vcpu_id, irq, level), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_id ) + __field( __u32, irq ) + __field( bool, level ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->irq = irq; + __entry->level = level; + ), + + TP_printk("VCPU: %ld, IRQ %d, level: %d", + __entry->vcpu_id, __entry->irq, __entry->level) +); + +#endif /* _TRACE_VGIC_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/arm64/kvm/vgic +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c new file mode 100644 index 000000000000..b13a9e3f99dd --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2016 Linaro + * Author: Christoffer Dall + */ + +#include +#include +#include +#include +#include +#include +#include +#include "vgic.h" + +/* + * Structure to 
control looping through the entire vgic state. We start at + * zero for each field and move upwards. So, if dist_id is 0 we print the + * distributor info. When dist_id is 1, we have already printed it and move + * on. + * + * When vcpu_id < nr_cpus we print the vcpu info until vcpu_id == nr_cpus and + * so on. + */ +struct vgic_state_iter { + int nr_cpus; + int nr_spis; + int nr_lpis; + int dist_id; + int vcpu_id; + int intid; + int lpi_idx; + u32 *lpi_array; +}; + +static void iter_next(struct vgic_state_iter *iter) +{ + if (iter->dist_id == 0) { + iter->dist_id++; + return; + } + + iter->intid++; + if (iter->intid == VGIC_NR_PRIVATE_IRQS && + ++iter->vcpu_id < iter->nr_cpus) + iter->intid = 0; + + if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS)) { + if (iter->lpi_idx < iter->nr_lpis) + iter->intid = iter->lpi_array[iter->lpi_idx]; + iter->lpi_idx++; + } +} + +static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter, + loff_t pos) +{ + int nr_cpus = atomic_read(&kvm->online_vcpus); + + memset(iter, 0, sizeof(*iter)); + + iter->nr_cpus = nr_cpus; + iter->nr_spis = kvm->arch.vgic.nr_spis; + if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + iter->nr_lpis = vgic_copy_lpi_list(kvm, NULL, &iter->lpi_array); + if (iter->nr_lpis < 0) + iter->nr_lpis = 0; + } + + /* Fast forward to the right position if needed */ + while (pos--) + iter_next(iter); +} + +static bool end_of_vgic(struct vgic_state_iter *iter) +{ + return iter->dist_id > 0 && + iter->vcpu_id == iter->nr_cpus && + iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) && + iter->lpi_idx > iter->nr_lpis; +} + +static void *vgic_debug_start(struct seq_file *s, loff_t *pos) +{ + struct kvm *kvm = (struct kvm *)s->private; + struct vgic_state_iter *iter; + + mutex_lock(&kvm->lock); + iter = kvm->arch.vgic.iter; + if (iter) { + iter = ERR_PTR(-EBUSY); + goto out; + } + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + iter = ERR_PTR(-ENOMEM); + goto out; + } + + 
iter_init(kvm, iter, *pos); + kvm->arch.vgic.iter = iter; + + if (end_of_vgic(iter)) + iter = NULL; +out: + mutex_unlock(&kvm->lock); + return iter; +} + +static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kvm *kvm = (struct kvm *)s->private; + struct vgic_state_iter *iter = kvm->arch.vgic.iter; + + ++*pos; + iter_next(iter); + if (end_of_vgic(iter)) + iter = NULL; + return iter; +} + +static void vgic_debug_stop(struct seq_file *s, void *v) +{ + struct kvm *kvm = (struct kvm *)s->private; + struct vgic_state_iter *iter; + + /* + * If the seq file wasn't properly opened, there's nothing to clearn + * up. + */ + if (IS_ERR(v)) + return; + + mutex_lock(&kvm->lock); + iter = kvm->arch.vgic.iter; + kfree(iter->lpi_array); + kfree(iter); + kvm->arch.vgic.iter = NULL; + mutex_unlock(&kvm->lock); +} + +static void print_dist_state(struct seq_file *s, struct vgic_dist *dist) +{ + bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3; + + seq_printf(s, "Distributor\n"); + seq_printf(s, "===========\n"); + seq_printf(s, "vgic_model:\t%s\n", v3 ? 
"GICv3" : "GICv2"); + seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis); + if (v3) + seq_printf(s, "nr_lpis:\t%d\n", dist->lpi_list_count); + seq_printf(s, "enabled:\t%d\n", dist->enabled); + seq_printf(s, "\n"); + + seq_printf(s, "P=pending_latch, L=line_level, A=active\n"); + seq_printf(s, "E=enabled, H=hw, C=config (level=1, edge=0)\n"); + seq_printf(s, "G=group\n"); +} + +static void print_header(struct seq_file *s, struct vgic_irq *irq, + struct kvm_vcpu *vcpu) +{ + int id = 0; + char *hdr = "SPI "; + + if (vcpu) { + hdr = "VCPU"; + id = vcpu->vcpu_id; + } + + seq_printf(s, "\n"); + seq_printf(s, "%s%2d TYP ID TGT_ID PLAEHCG HWID TARGET SRC PRI VCPU_ID\n", hdr, id); + seq_printf(s, "----------------------------------------------------------------\n"); +} + +static void print_irq_state(struct seq_file *s, struct vgic_irq *irq, + struct kvm_vcpu *vcpu) +{ + char *type; + bool pending; + + if (irq->intid < VGIC_NR_SGIS) + type = "SGI"; + else if (irq->intid < VGIC_NR_PRIVATE_IRQS) + type = "PPI"; + else if (irq->intid < VGIC_MAX_SPI) + type = "SPI"; + else + type = "LPI"; + + if (irq->intid ==0 || irq->intid == VGIC_NR_PRIVATE_IRQS) + print_header(s, irq, vcpu); + + pending = irq->pending_latch; + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + int err; + + err = irq_get_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + &pending); + WARN_ON_ONCE(err); + } + + seq_printf(s, " %s %4d " + " %2d " + "%d%d%d%d%d%d%d " + "%8d " + "%8x " + " %2x " + "%3d " + " %2d " + "\n", + type, irq->intid, + (irq->target_vcpu) ? irq->target_vcpu->vcpu_id : -1, + pending, + irq->line_level, + irq->active, + irq->enabled, + irq->hw, + irq->config == VGIC_CONFIG_LEVEL, + irq->group, + irq->hwintid, + irq->mpidr, + irq->source, + irq->priority, + (irq->vcpu) ? 
irq->vcpu->vcpu_id : -1); +} + +static int vgic_debug_show(struct seq_file *s, void *v) +{ + struct kvm *kvm = (struct kvm *)s->private; + struct vgic_state_iter *iter = (struct vgic_state_iter *)v; + struct vgic_irq *irq; + struct kvm_vcpu *vcpu = NULL; + unsigned long flags; + + if (iter->dist_id == 0) { + print_dist_state(s, &kvm->arch.vgic); + return 0; + } + + if (!kvm->arch.vgic.initialized) + return 0; + + if (iter->vcpu_id < iter->nr_cpus) + vcpu = kvm_get_vcpu(kvm, iter->vcpu_id); + + irq = vgic_get_irq(kvm, vcpu, iter->intid); + if (!irq) { + seq_printf(s, " LPI %4d freed\n", iter->intid); + return 0; + } + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + print_irq_state(s, irq, vcpu); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(kvm, irq); + return 0; +} + +static const struct seq_operations vgic_debug_seq_ops = { + .start = vgic_debug_start, + .next = vgic_debug_next, + .stop = vgic_debug_stop, + .show = vgic_debug_show +}; + +static int debug_open(struct inode *inode, struct file *file) +{ + int ret; + ret = seq_open(file, &vgic_debug_seq_ops); + if (!ret) { + struct seq_file *seq; + /* seq_open will have modified file->private_data */ + seq = file->private_data; + seq->private = inode->i_private; + } + + return ret; +}; + +static const struct file_operations vgic_debug_fops = { + .owner = THIS_MODULE, + .open = debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +void vgic_debug_init(struct kvm *kvm) +{ + debugfs_create_file("vgic-state", 0444, kvm->debugfs_dentry, kvm, + &vgic_debug_fops); +} + +void vgic_debug_destroy(struct kvm *kvm) +{ +} diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c new file mode 100644 index 000000000000..32e32d67a127 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -0,0 +1,556 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015, 2016 ARM Ltd. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "vgic.h" + +/* + * Initialization rules: there are multiple stages to the vgic + * initialization, both for the distributor and the CPU interfaces. The basic + * idea is that even though the VGIC is not functional or not requested from + * user space, the critical path of the run loop can still call VGIC functions + * that just won't do anything, without them having to check additional + * initialization flags to ensure they don't look at uninitialized data + * structures. + * + * Distributor: + * + * - kvm_vgic_early_init(): initialization of static data that doesn't + * depend on any sizing information or emulation type. No allocation + * is allowed there. + * + * - vgic_init(): allocation and initialization of the generic data + * structures that depend on sizing information (number of CPUs, + * number of interrupts). Also initializes the vcpu specific data + * structures. Can be executed lazily for GICv2. + * + * CPU Interface: + * + * - kvm_vgic_vcpu_init(): initialization of static data that + * doesn't depend on any sizing information or emulation type. No + * allocation is allowed there. + */ + +/* EARLY INIT */ + +/** + * kvm_vgic_early_init() - Initialize static VGIC VCPU data structures + * @kvm: The VM whose VGIC districutor should be initialized + * + * Only do initialization of static structures that don't require any + * allocation or sizing information from userspace. vgic_init() called + * kvm_vgic_dist_init() which takes care of the rest. 
+ */ +void kvm_vgic_early_init(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + + INIT_LIST_HEAD(&dist->lpi_list_head); + INIT_LIST_HEAD(&dist->lpi_translation_cache); + raw_spin_lock_init(&dist->lpi_list_lock); +} + +/* CREATION */ + +/** + * kvm_vgic_create: triggered by the instantiation of the VGIC device by + * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only) + * or through the generic KVM_CREATE_DEVICE API ioctl. + * irqchip_in_kernel() tells you if this function succeeded or not. + * @kvm: kvm struct pointer + * @type: KVM_DEV_TYPE_ARM_VGIC_V[23] + */ +int kvm_vgic_create(struct kvm *kvm, u32 type) +{ + int i, ret; + struct kvm_vcpu *vcpu; + + if (irqchip_in_kernel(kvm)) + return -EEXIST; + + /* + * This function is also called by the KVM_CREATE_IRQCHIP handler, + * which had no chance yet to check the availability of the GICv2 + * emulation. So check this here again. KVM_CREATE_DEVICE does + * the proper checks already. + */ + if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && + !kvm_vgic_global_state.can_emulate_gicv2) + return -ENODEV; + + ret = -EBUSY; + if (!lock_all_vcpus(kvm)) + return ret; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->arch.has_run_once) + goto out_unlock; + } + ret = 0; + + if (type == KVM_DEV_TYPE_ARM_VGIC_V2) + kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS; + else + kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS; + + if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) { + ret = -E2BIG; + goto out_unlock; + } + + kvm->arch.vgic.in_kernel = true; + kvm->arch.vgic.vgic_model = type; + + kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; + + if (type == KVM_DEV_TYPE_ARM_VGIC_V2) + kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; + else + INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions); + +out_unlock: + unlock_all_vcpus(kvm); + return ret; +} + +/* INIT/DESTROY */ + +/** + * kvm_vgic_dist_init: initialize the dist data structures + * @kvm: kvm struct pointer + * @nr_spis: number of spis, frozen by caller + */ +static 
int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); + int i; + + dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL); + if (!dist->spis) + return -ENOMEM; + + /* + * In the following code we do not take the irq struct lock since + * no other action on irq structs can happen while the VGIC is + * not initialized yet: + * If someone wants to inject an interrupt or does a MMIO access, we + * require prior initialization in case of a virtual GICv3 or trigger + * initialization when using a virtual GICv2. + */ + for (i = 0; i < nr_spis; i++) { + struct vgic_irq *irq = &dist->spis[i]; + + irq->intid = i + VGIC_NR_PRIVATE_IRQS; + INIT_LIST_HEAD(&irq->ap_list); + raw_spin_lock_init(&irq->irq_lock); + irq->vcpu = NULL; + irq->target_vcpu = vcpu0; + kref_init(&irq->refcount); + switch (dist->vgic_model) { + case KVM_DEV_TYPE_ARM_VGIC_V2: + irq->targets = 0; + irq->group = 0; + break; + case KVM_DEV_TYPE_ARM_VGIC_V3: + irq->mpidr = 0; + irq->group = 1; + break; + default: + kfree(dist->spis); + dist->spis = NULL; + return -EINVAL; + } + } + return 0; +} + +/** + * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data + * structures and register VCPU-specific KVM iodevs + * + * @vcpu: pointer to the VCPU being created and initialized + * + * Only do initialization, but do not actually enable the + * VGIC CPU interface + */ +int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + int ret = 0; + int i; + + vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; + + INIT_LIST_HEAD(&vgic_cpu->ap_list_head); + raw_spin_lock_init(&vgic_cpu->ap_list_lock); + atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0); + + /* + * Enable and configure all SGIs to be edge-triggered and + * configure all PPIs as level-triggered. 
+ */ + for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { + struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; + + INIT_LIST_HEAD(&irq->ap_list); + raw_spin_lock_init(&irq->irq_lock); + irq->intid = i; + irq->vcpu = NULL; + irq->target_vcpu = vcpu; + kref_init(&irq->refcount); + if (vgic_irq_is_sgi(i)) { + /* SGIs */ + irq->enabled = 1; + irq->config = VGIC_CONFIG_EDGE; + } else { + /* PPIs */ + irq->config = VGIC_CONFIG_LEVEL; + } + } + + if (!irqchip_in_kernel(vcpu->kvm)) + return 0; + + /* + * If we are creating a VCPU with a GICv3 we must also register the + * KVM io device for the redistributor that belongs to this VCPU. + */ + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + mutex_lock(&vcpu->kvm->lock); + ret = vgic_register_redist_iodev(vcpu); + mutex_unlock(&vcpu->kvm->lock); + } + return ret; +} + +static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_enable(vcpu); + else + vgic_v3_enable(vcpu); +} + +/* + * vgic_init: allocates and initializes dist and vcpu data structures + * depending on two dimensioning parameters: + * - the number of spis + * - the number of vcpus + * The function is generally called when nr_spis has been explicitly set + * by the guest through the KVM DEVICE API. If not nr_spis is set to 256. + * vgic_initialized() returns true when this function has succeeded. + * Must be called with kvm->lock held! + */ +int vgic_init(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int ret = 0, i, idx; + + if (vgic_initialized(kvm)) + return 0; + + /* Are we also in the middle of creating a VCPU? 
*/ + if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) + return -EBUSY; + + /* freeze the number of spis */ + if (!dist->nr_spis) + dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS; + + ret = kvm_vgic_dist_init(kvm, dist->nr_spis); + if (ret) + goto out; + + /* Initialize groups on CPUs created before the VGIC type was known */ + kvm_for_each_vcpu(idx, vcpu, kvm) { + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { + struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; + switch (dist->vgic_model) { + case KVM_DEV_TYPE_ARM_VGIC_V3: + irq->group = 1; + irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); + break; + case KVM_DEV_TYPE_ARM_VGIC_V2: + irq->group = 0; + irq->targets = 1U << idx; + break; + default: + ret = -EINVAL; + goto out; + } + } + } + + if (vgic_has_its(kvm)) + vgic_lpi_translation_cache_init(kvm); + + /* + * If we have GICv4.1 enabled, unconditionnaly request enable the + * v4 support so that we get HW-accelerated vSGIs. Otherwise, only + * enable it if we present a virtual ITS to the guest. 
+ */ + if (vgic_supports_direct_msis(kvm)) { + ret = vgic_v4_init(kvm); + if (ret) + goto out; + } + + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vgic_vcpu_enable(vcpu); + + ret = kvm_vgic_setup_default_irq_routing(kvm); + if (ret) + goto out; + + vgic_debug_init(kvm); + + dist->implementation_rev = 2; + dist->initialized = true; + +out: + return ret; +} + +static void kvm_vgic_dist_destroy(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_redist_region *rdreg, *next; + + dist->ready = false; + dist->initialized = false; + + kfree(dist->spis); + dist->spis = NULL; + dist->nr_spis = 0; + + if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) { + list_del(&rdreg->list); + kfree(rdreg); + } + INIT_LIST_HEAD(&dist->rd_regions); + } + + if (vgic_has_its(kvm)) + vgic_lpi_translation_cache_destroy(kvm); + + if (vgic_supports_direct_msis(kvm)) + vgic_v4_teardown(kvm); +} + +void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + /* + * Retire all pending LPIs on this vcpu anyway as we're + * going to destroy it. + */ + vgic_flush_pending_lpis(vcpu); + + INIT_LIST_HEAD(&vgic_cpu->ap_list_head); +} + +/* To be called with kvm->lock held */ +static void __kvm_vgic_destroy(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int i; + + vgic_debug_destroy(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vgic_vcpu_destroy(vcpu); + + kvm_vgic_dist_destroy(kvm); +} + +void kvm_vgic_destroy(struct kvm *kvm) +{ + mutex_lock(&kvm->lock); + __kvm_vgic_destroy(kvm); + mutex_unlock(&kvm->lock); +} + +/** + * vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest + * is a GICv2. A GICv3 must be explicitly initialized by the guest using the + * KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group. 
+ * @kvm: kvm struct pointer + */ +int vgic_lazy_init(struct kvm *kvm) +{ + int ret = 0; + + if (unlikely(!vgic_initialized(kvm))) { + /* + * We only provide the automatic initialization of the VGIC + * for the legacy case of a GICv2. Any other type must + * be explicitly initialized once setup with the respective + * KVM device call. + */ + if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) + return -EBUSY; + + mutex_lock(&kvm->lock); + ret = vgic_init(kvm); + mutex_unlock(&kvm->lock); + } + + return ret; +} + +/* RESOURCE MAPPING */ + +/** + * Map the MMIO regions depending on the VGIC model exposed to the guest + * called on the first VCPU run. + * Also map the virtual CPU interface into the VM. + * v2/v3 derivatives call vgic_init if not already done. + * vgic_ready() returns true if this function has succeeded. + * @kvm: kvm struct pointer + */ +int kvm_vgic_map_resources(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + int ret = 0; + + mutex_lock(&kvm->lock); + if (!irqchip_in_kernel(kvm)) + goto out; + + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) + ret = vgic_v2_map_resources(kvm); + else + ret = vgic_v3_map_resources(kvm); + + if (ret) + __kvm_vgic_destroy(kvm); + +out: + mutex_unlock(&kvm->lock); + return ret; +} + +/* GENERIC PROBE */ + +static int vgic_init_cpu_starting(unsigned int cpu) +{ + enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0); + return 0; +} + + +static int vgic_init_cpu_dying(unsigned int cpu) +{ + disable_percpu_irq(kvm_vgic_global_state.maint_irq); + return 0; +} + +static irqreturn_t vgic_maintenance_handler(int irq, void *data) +{ + /* + * We cannot rely on the vgic maintenance interrupt to be + * delivered synchronously. This means we can only use it to + * exit the VM, and we perform the handling of EOIed + * interrupts on the exit path (see vgic_fold_lr_state). 
+ */ + return IRQ_HANDLED; +} + +/** + * kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware + * + * For a specific CPU, initialize the GIC VE hardware. + */ +void kvm_vgic_init_cpu_hardware(void) +{ + BUG_ON(preemptible()); + + /* + * We want to make sure the list registers start out clear so that we + * only have to program the used registers. + */ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_init_lrs(); + else + kvm_call_hyp(__vgic_v3_init_lrs); +} + +/** + * kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable + * according to the host GIC model. Accordingly calls either + * vgic_v2/v3_probe which registers the KVM_DEVICE that can be + * instantiated by a guest later on. + */ +int kvm_vgic_hyp_init(void) +{ + const struct gic_kvm_info *gic_kvm_info; + int ret; + + gic_kvm_info = gic_get_kvm_info(); + if (!gic_kvm_info) + return -ENODEV; + + if (!gic_kvm_info->maint_irq) { + kvm_err("No vgic maintenance irq\n"); + return -ENXIO; + } + + switch (gic_kvm_info->type) { + case GIC_V2: + ret = vgic_v2_probe(gic_kvm_info); + break; + case GIC_V3: + ret = vgic_v3_probe(gic_kvm_info); + if (!ret) { + static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif); + kvm_info("GIC system register CPU interface enabled\n"); + } + break; + default: + ret = -ENODEV; + } + + if (ret) + return ret; + + kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq; + ret = request_percpu_irq(kvm_vgic_global_state.maint_irq, + vgic_maintenance_handler, + "vgic", kvm_get_running_vcpus()); + if (ret) { + kvm_err("Cannot register interrupt %d\n", + kvm_vgic_global_state.maint_irq); + return ret; + } + + ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING, + "kvm/arm/vgic:starting", + vgic_init_cpu_starting, vgic_init_cpu_dying); + if (ret) { + kvm_err("Cannot register vgic CPU notifier\n"); + goto out_free_irq; + } + + kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq); + return 0; + +out_free_irq: + 
free_percpu_irq(kvm_vgic_global_state.maint_irq, + kvm_get_running_vcpus()); + return ret; +} diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c new file mode 100644 index 000000000000..d8cdfea5cc96 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015, 2016 ARM Ltd. + */ + +#include +#include +#include +#include +#include "vgic.h" + +/** + * vgic_irqfd_set_irq: inject the IRQ corresponding to the + * irqchip routing entry + * + * This is the entry point for irqfd IRQ injection + */ +static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, + int level, bool line_status) +{ + unsigned int spi_id = e->irqchip.pin + VGIC_NR_PRIVATE_IRQS; + + if (!vgic_valid_spi(kvm, spi_id)) + return -EINVAL; + return kvm_vgic_inject_irq(kvm, 0, spi_id, level, NULL); +} + +/** + * kvm_set_routing_entry: populate a kvm routing entry + * from a user routing entry + * + * @kvm: the VM this entry is applied to + * @e: kvm kernel routing entry handle + * @ue: user api routing entry handle + * return 0 on success, -EINVAL on errors. 
+ */ +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + int r = -EINVAL; + + switch (ue->type) { + case KVM_IRQ_ROUTING_IRQCHIP: + e->set = vgic_irqfd_set_irq; + e->irqchip.irqchip = ue->u.irqchip.irqchip; + e->irqchip.pin = ue->u.irqchip.pin; + if ((e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) || + (e->irqchip.irqchip >= KVM_NR_IRQCHIPS)) + goto out; + break; + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + e->msi.flags = ue->flags; + e->msi.devid = ue->u.msi.devid; + break; + default: + goto out; + } + r = 0; +out: + return r; +} + +static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm_msi *msi) +{ + msi->address_lo = e->msi.address_lo; + msi->address_hi = e->msi.address_hi; + msi->data = e->msi.data; + msi->flags = e->msi.flags; + msi->devid = e->msi.devid; +} +/** + * kvm_set_msi: inject the MSI corresponding to the + * MSI routing entry + * + * This is the entry point for irqfd MSI injection + * and userspace MSI injection. + */ +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, + int level, bool line_status) +{ + struct kvm_msi msi; + + if (!vgic_has_its(kvm)) + return -ENODEV; + + if (!level) + return -1; + + kvm_populate_msi(e, &msi); + return vgic_its_inject_msi(kvm, &msi); +} + +/** + * kvm_arch_set_irq_inatomic: fast-path for irqfd injection + * + * Currently only direct MSI injection is supported. 
+ */ +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + if (e->type == KVM_IRQ_ROUTING_MSI && vgic_has_its(kvm) && level) { + struct kvm_msi msi; + + kvm_populate_msi(e, &msi); + if (!vgic_its_inject_cached_translation(kvm, &msi)) + return 0; + } + + return -EWOULDBLOCK; +} + +int kvm_vgic_setup_default_irq_routing(struct kvm *kvm) +{ + struct kvm_irq_routing_entry *entries; + struct vgic_dist *dist = &kvm->arch.vgic; + u32 nr = dist->nr_spis; + int i, ret; + + entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL); + if (!entries) + return -ENOMEM; + + for (i = 0; i < nr; i++) { + entries[i].gsi = i; + entries[i].type = KVM_IRQ_ROUTING_IRQCHIP; + entries[i].u.irqchip.irqchip = 0; + entries[i].u.irqchip.pin = i; + } + ret = kvm_set_irq_routing(kvm, entries, nr, 0); + kfree(entries); + return ret; +} diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c new file mode 100644 index 000000000000..c012a52b19f5 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -0,0 +1,2783 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * GICv3 ITS emulation + * + * Copyright (C) 2015,2016 ARM Ltd. + * Author: Andre Przywara + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "vgic.h" +#include "vgic-mmio.h" + +static int vgic_its_save_tables_v0(struct vgic_its *its); +static int vgic_its_restore_tables_v0(struct vgic_its *its); +static int vgic_its_commit_v0(struct vgic_its *its); +static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, + struct kvm_vcpu *filter_vcpu, bool needs_inv); + +/* + * Creates a new (reference to a) struct vgic_irq for a given LPI. + * If this LPI is already mapped on another ITS, we increase its refcount + * and return a pointer to the existing structure. + * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq. 
+ * This function returns a pointer to the _unlocked_ structure. + */ +static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, + struct kvm_vcpu *vcpu) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq; + unsigned long flags; + int ret; + + /* In this case there is no put, since we keep the reference. */ + if (irq) + return irq; + + irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL); + if (!irq) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&irq->lpi_list); + INIT_LIST_HEAD(&irq->ap_list); + raw_spin_lock_init(&irq->irq_lock); + + irq->config = VGIC_CONFIG_EDGE; + kref_init(&irq->refcount); + irq->intid = intid; + irq->target_vcpu = vcpu; + irq->group = 1; + + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + + /* + * There could be a race with another vgic_add_lpi(), so we need to + * check that we don't add a second list entry with the same LPI. + */ + list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) { + if (oldirq->intid != intid) + continue; + + /* Someone was faster with adding this LPI, lets use that. */ + kfree(irq); + irq = oldirq; + + /* + * This increases the refcount, the caller is expected to + * call vgic_put_irq() on the returned pointer once it's + * finished with the IRQ. + */ + vgic_get_irq_kref(irq); + + goto out_unlock; + } + + list_add_tail(&irq->lpi_list, &dist->lpi_list_head); + dist->lpi_list_count++; + +out_unlock: + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + + /* + * We "cache" the configuration table entries in our struct vgic_irq's. + * However we only have those structs for mapped IRQs, so we read in + * the respective config data from memory here upon mapping the LPI. + * + * Should any of these fail, behave as if we couldn't create the LPI + * by dropping the refcount and returning the error. 
+ */ + ret = update_lpi_config(kvm, irq, NULL, false); + if (ret) { + vgic_put_irq(kvm, irq); + return ERR_PTR(ret); + } + + ret = vgic_v3_lpi_sync_pending_status(kvm, irq); + if (ret) { + vgic_put_irq(kvm, irq); + return ERR_PTR(ret); + } + + return irq; +} + +struct its_device { + struct list_head dev_list; + + /* the head for the list of ITTEs */ + struct list_head itt_head; + u32 num_eventid_bits; + gpa_t itt_addr; + u32 device_id; +}; + +#define COLLECTION_NOT_MAPPED ((u32)~0) + +struct its_collection { + struct list_head coll_list; + + u32 collection_id; + u32 target_addr; +}; + +#define its_is_collection_mapped(coll) ((coll) && \ + ((coll)->target_addr != COLLECTION_NOT_MAPPED)) + +struct its_ite { + struct list_head ite_list; + + struct vgic_irq *irq; + struct its_collection *collection; + u32 event_id; +}; + +struct vgic_translation_cache_entry { + struct list_head entry; + phys_addr_t db; + u32 devid; + u32 eventid; + struct vgic_irq *irq; +}; + +/** + * struct vgic_its_abi - ITS abi ops and settings + * @cte_esz: collection table entry size + * @dte_esz: device table entry size + * @ite_esz: interrupt translation table entry size + * @save_tables: save the ITS tables into guest RAM + * @restore_tables: restore the ITS internal structs from tables + * stored in guest RAM + * @commit: initialize the registers which expose the ABI settings, + * especially the entry sizes + */ +struct vgic_its_abi { + int cte_esz; + int dte_esz; + int ite_esz; + int (*save_tables)(struct vgic_its *its); + int (*restore_tables)(struct vgic_its *its); + int (*commit)(struct vgic_its *its); +}; + +#define ABI_0_ESZ 8 +#define ESZ_MAX ABI_0_ESZ + +static const struct vgic_its_abi its_table_abi_versions[] = { + [0] = { + .cte_esz = ABI_0_ESZ, + .dte_esz = ABI_0_ESZ, + .ite_esz = ABI_0_ESZ, + .save_tables = vgic_its_save_tables_v0, + .restore_tables = vgic_its_restore_tables_v0, + .commit = vgic_its_commit_v0, + }, +}; + +#define NR_ITS_ABIS ARRAY_SIZE(its_table_abi_versions) + 
+inline const struct vgic_its_abi *vgic_its_get_abi(struct vgic_its *its) +{ + return &its_table_abi_versions[its->abi_rev]; +} + +static int vgic_its_set_abi(struct vgic_its *its, u32 rev) +{ + const struct vgic_its_abi *abi; + + its->abi_rev = rev; + abi = vgic_its_get_abi(its); + return abi->commit(its); +} + +/* + * Finds and returns a device in the device table for an ITS. + * Must be called with the its_lock mutex held. + */ +static struct its_device *find_its_device(struct vgic_its *its, u32 device_id) +{ + struct its_device *device; + + list_for_each_entry(device, &its->device_list, dev_list) + if (device_id == device->device_id) + return device; + + return NULL; +} + +/* + * Finds and returns an interrupt translation table entry (ITTE) for a given + * Device ID/Event ID pair on an ITS. + * Must be called with the its_lock mutex held. + */ +static struct its_ite *find_ite(struct vgic_its *its, u32 device_id, + u32 event_id) +{ + struct its_device *device; + struct its_ite *ite; + + device = find_its_device(its, device_id); + if (device == NULL) + return NULL; + + list_for_each_entry(ite, &device->itt_head, ite_list) + if (ite->event_id == event_id) + return ite; + + return NULL; +} + +/* To be used as an iterator, this macro omits the enclosing parentheses */ +#define for_each_lpi_its(dev, ite, its) \ + list_for_each_entry(dev, &(its)->device_list, dev_list) \ + list_for_each_entry(ite, &(dev)->itt_head, ite_list) + +#define GIC_LPI_OFFSET 8192 + +#define VITS_TYPER_IDBITS 16 +#define VITS_TYPER_DEVBITS 16 +#define VITS_DTE_MAX_DEVID_OFFSET (BIT(14) - 1) +#define VITS_ITE_MAX_EVENTID_OFFSET (BIT(16) - 1) + +/* + * Finds and returns a collection in the ITS collection table. + * Must be called with the its_lock mutex held. 
+ */ +static struct its_collection *find_collection(struct vgic_its *its, int coll_id) +{ + struct its_collection *collection; + + list_for_each_entry(collection, &its->collection_list, coll_list) { + if (coll_id == collection->collection_id) + return collection; + } + + return NULL; +} + +#define LPI_PROP_ENABLE_BIT(p) ((p) & LPI_PROP_ENABLED) +#define LPI_PROP_PRIORITY(p) ((p) & 0xfc) + +/* + * Reads the configuration data for a given LPI from guest memory and + * updates the fields in struct vgic_irq. + * If filter_vcpu is not NULL, applies only if the IRQ is targeting this + * VCPU. Unconditionally applies if filter_vcpu is NULL. + */ +static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, + struct kvm_vcpu *filter_vcpu, bool needs_inv) +{ + u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser); + u8 prop; + int ret; + unsigned long flags; + + ret = kvm_read_guest_lock(kvm, propbase + irq->intid - GIC_LPI_OFFSET, + &prop, 1); + + if (ret) + return ret; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (!filter_vcpu || filter_vcpu == irq->target_vcpu) { + irq->priority = LPI_PROP_PRIORITY(prop); + irq->enabled = LPI_PROP_ENABLE_BIT(prop); + + if (!irq->hw) { + vgic_queue_irq_unlock(kvm, irq, flags); + return 0; + } + } + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + if (irq->hw) + return its_prop_update_vlpi(irq->host_irq, prop, needs_inv); + + return 0; +} + +/* + * Create a snapshot of the current LPIs targeting @vcpu, so that we can + * enumerate those LPIs without holding any lock. + * Returns their number and puts the kmalloc'ed array into intid_ptr. + */ +int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq; + unsigned long flags; + u32 *intids; + int irq_count, i = 0; + + /* + * There is an obvious race between allocating the array and LPIs + * being mapped/unmapped. 
If we ended up here as a result of a + * command, we're safe (locks are held, preventing another + * command). If coming from another path (such as enabling LPIs), + * we must be careful not to overrun the array. + */ + irq_count = READ_ONCE(dist->lpi_list_count); + intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL); + if (!intids) + return -ENOMEM; + + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { + if (i == irq_count) + break; + /* We don't need to "get" the IRQ, as we hold the list lock. */ + if (vcpu && irq->target_vcpu != vcpu) + continue; + intids[i++] = irq->intid; + } + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + + *intid_ptr = intids; + return i; +} + +static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu) +{ + int ret = 0; + unsigned long flags; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->target_vcpu = vcpu; + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + if (irq->hw) { + struct its_vlpi_map map; + + ret = its_get_vlpi(irq->host_irq, &map); + if (ret) + return ret; + + if (map.vpe) + atomic_dec(&map.vpe->vlpi_count); + map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + atomic_inc(&map.vpe->vlpi_count); + + ret = its_map_vlpi(irq->host_irq, &map); + } + + return ret; +} + +/* + * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI + * is targeting) to the VGIC's view, which deals with target VCPUs. + * Needs to be called whenever either the collection for a LPIs has + * changed or the collection itself got retargeted. + */ +static void update_affinity_ite(struct kvm *kvm, struct its_ite *ite) +{ + struct kvm_vcpu *vcpu; + + if (!its_is_collection_mapped(ite->collection)) + return; + + vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); + update_affinity(ite->irq, vcpu); +} + +/* + * Updates the target VCPU for every LPI targeting this collection. + * Must be called with the its_lock mutex held. 
+ */ +static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its, + struct its_collection *coll) +{ + struct its_device *device; + struct its_ite *ite; + + for_each_lpi_its(device, ite, its) { + if (!ite->collection || coll != ite->collection) + continue; + + update_affinity_ite(kvm, ite); + } +} + +static u32 max_lpis_propbaser(u64 propbaser) +{ + int nr_idbits = (propbaser & 0x1f) + 1; + + return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS); +} + +/* + * Sync the pending table pending bit of LPIs targeting @vcpu + * with our own data structures. This relies on the LPI being + * mapped before. + */ +static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) +{ + gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); + struct vgic_irq *irq; + int last_byte_offset = -1; + int ret = 0; + u32 *intids; + int nr_irqs, i; + unsigned long flags; + u8 pendmask; + + nr_irqs = vgic_copy_lpi_list(vcpu->kvm, vcpu, &intids); + if (nr_irqs < 0) + return nr_irqs; + + for (i = 0; i < nr_irqs; i++) { + int byte_offset, bit_nr; + + byte_offset = intids[i] / BITS_PER_BYTE; + bit_nr = intids[i] % BITS_PER_BYTE; + + /* + * For contiguously allocated LPIs chances are we just read + * this very same byte in the last iteration. Reuse that. 
+ */ + if (byte_offset != last_byte_offset) { + ret = kvm_read_guest_lock(vcpu->kvm, + pendbase + byte_offset, + &pendmask, 1); + if (ret) { + kfree(intids); + return ret; + } + last_byte_offset = byte_offset; + } + + irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]); + /* + * The intid snapshot holds no reference on the LPIs, so this + * one may have been unmapped since the list was copied. + */ + if (!irq) + continue; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->pending_latch = pendmask & (1U << bit_nr); + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + vgic_put_irq(vcpu->kvm, irq); + } + + kfree(intids); + + return ret; +} + +static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 reg = GITS_TYPER_PLPIS; + + /* + * We use linear CPU numbers for redistributor addressing, + * so GITS_TYPER.PTA is 0. + * Also we force all PROPBASER registers to be the same, so + * CommonLPIAff is 0 as well. + * To avoid memory waste in the guest, we keep the number of IDBits and + * DevBits low - at least for the time being. 
+ */ + reg |= GIC_ENCODE_SZ(VITS_TYPER_DEVBITS, 5) << GITS_TYPER_DEVBITS_SHIFT; + reg |= GIC_ENCODE_SZ(VITS_TYPER_IDBITS, 5) << GITS_TYPER_IDBITS_SHIFT; + reg |= GIC_ENCODE_SZ(abi->ite_esz, 4) << GITS_TYPER_ITT_ENTRY_SIZE_SHIFT; + + return extract_bytes(reg, addr & 7, len); +} + +static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + u32 val; + + val = (its->abi_rev << GITS_IIDR_REV_SHIFT) & GITS_IIDR_REV_MASK; + val |= (PRODUCT_ID_KVM << GITS_IIDR_PRODUCTID_SHIFT) | IMPLEMENTER_ARM; + return val; +} + +static int vgic_mmio_uaccess_write_its_iidr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 rev = GITS_IIDR_REV(val); + + if (rev >= NR_ITS_ABIS) + return -EINVAL; + return vgic_its_set_abi(its, rev); +} + +static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + switch (addr & 0xffff) { + case GITS_PIDR0: + return 0x92; /* part number, bits[7:0] */ + case GITS_PIDR1: + return 0xb4; /* part number, bits[11:8] */ + case GITS_PIDR2: + return GIC_PIDR2_ARCH_GICv3 | 0x0b; + case GITS_PIDR4: + return 0x40; /* This is a 64K software visible page */ + /* The following are the ID registers for (any) GIC. */ + case GITS_CIDR0: + return 0x0d; + case GITS_CIDR1: + return 0xf0; + case GITS_CIDR2: + return 0x05; + case GITS_CIDR3: + return 0xb1; + } + + return 0; +} + +static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist, + phys_addr_t db, + u32 devid, u32 eventid) +{ + struct vgic_translation_cache_entry *cte; + + list_for_each_entry(cte, &dist->lpi_translation_cache, entry) { + /* + * If we hit a NULL entry, there is nothing after this + * point. + */ + if (!cte->irq) + break; + + if (cte->db != db || cte->devid != devid || + cte->eventid != eventid) + continue; + + /* + * Move this entry to the head, as it is the most + * recently used. 
+ */ + if (!list_is_first(&cte->entry, &dist->lpi_translation_cache)) + list_move(&cte->entry, &dist->lpi_translation_cache); + + return cte->irq; + } + + return NULL; +} + +static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db, + u32 devid, u32 eventid) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq; + unsigned long flags; + + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + irq = __vgic_its_check_cache(dist, db, devid, eventid); + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + + return irq; +} + +static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its, + u32 devid, u32 eventid, + struct vgic_irq *irq) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_translation_cache_entry *cte; + unsigned long flags; + phys_addr_t db; + + /* Do not cache a directly injected interrupt */ + if (irq->hw) + return; + + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + + if (unlikely(list_empty(&dist->lpi_translation_cache))) + goto out; + + /* + * We could have raced with another CPU caching the same + * translation behind our back, so let's check it is not in + * already + */ + db = its->vgic_its_base + GITS_TRANSLATER; + if (__vgic_its_check_cache(dist, db, devid, eventid)) + goto out; + + /* Always reuse the last entry (LRU policy) */ + cte = list_last_entry(&dist->lpi_translation_cache, + typeof(*cte), entry); + + /* + * Caching the translation implies having an extra reference + * to the interrupt, so drop the potential reference on what + * was in the cache, and increment it on the new interrupt. 
+ */ + if (cte->irq) + __vgic_put_lpi_locked(kvm, cte->irq); + + vgic_get_irq_kref(irq); + + cte->db = db; + cte->devid = devid; + cte->eventid = eventid; + cte->irq = irq; + + /* Move the new translation to the head of the list */ + list_move(&cte->entry, &dist->lpi_translation_cache); + +out: + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); +} + +void vgic_its_invalidate_cache(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_translation_cache_entry *cte; + unsigned long flags; + + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + + list_for_each_entry(cte, &dist->lpi_translation_cache, entry) { + /* + * If we hit a NULL entry, there is nothing after this + * point. + */ + if (!cte->irq) + break; + + __vgic_put_lpi_locked(kvm, cte->irq); + cte->irq = NULL; + } + + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); +} + +int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, + u32 devid, u32 eventid, struct vgic_irq **irq) +{ + struct kvm_vcpu *vcpu; + struct its_ite *ite; + + if (!its->enabled) + return -EBUSY; + + ite = find_ite(its, devid, eventid); + if (!ite || !its_is_collection_mapped(ite->collection)) + return E_ITS_INT_UNMAPPED_INTERRUPT; + + vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); + if (!vcpu) + return E_ITS_INT_UNMAPPED_INTERRUPT; + + if (!vcpu->arch.vgic_cpu.lpis_enabled) + return -EBUSY; + + vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq); + + *irq = ite->irq; + return 0; +} + +struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi) +{ + u64 address; + struct kvm_io_device *kvm_io_dev; + struct vgic_io_device *iodev; + + if (!vgic_has_its(kvm)) + return ERR_PTR(-ENODEV); + + if (!(msi->flags & KVM_MSI_VALID_DEVID)) + return ERR_PTR(-EINVAL); + + address = (u64)msi->address_hi << 32 | msi->address_lo; + + kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address); + if (!kvm_io_dev) + return ERR_PTR(-EINVAL); + + if (kvm_io_dev->ops != 
&kvm_io_gic_ops) + return ERR_PTR(-EINVAL); + + iodev = container_of(kvm_io_dev, struct vgic_io_device, dev); + if (iodev->iodev_type != IODEV_ITS) + return ERR_PTR(-EINVAL); + + return iodev->its; +} + +/* + * Find the target VCPU and the LPI number for a given devid/eventid pair + * and make this IRQ pending, possibly injecting it. + * Must be called with the its_lock mutex held. + * Returns 0 on success, a positive error value for any ITS mapping + * related errors and negative error values for generic errors. + */ +static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its, + u32 devid, u32 eventid) +{ + struct vgic_irq *irq = NULL; + unsigned long flags; + int err; + + err = vgic_its_resolve_lpi(kvm, its, devid, eventid, &irq); + if (err) + return err; + + if (irq->hw) + return irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, true); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->pending_latch = true; + vgic_queue_irq_unlock(kvm, irq, flags); + + return 0; +} + +int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi) +{ + struct vgic_irq *irq; + unsigned long flags; + phys_addr_t db; + + db = (u64)msi->address_hi << 32 | msi->address_lo; + irq = vgic_its_check_cache(kvm, db, msi->devid, msi->data); + + if (!irq) + return -1; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->pending_latch = true; + vgic_queue_irq_unlock(kvm, irq, flags); + + return 0; +} + +/* + * Queries the KVM IO bus framework to get the ITS pointer from the given + * doorbell address. + * We then call vgic_its_trigger_msi() with the decoded data. + * According to the KVM_SIGNAL_MSI API description returns 1 on success. 
+ */ +int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi) +{ + struct vgic_its *its; + int ret; + + if (!vgic_its_inject_cached_translation(kvm, msi)) + return 1; + + its = vgic_msi_to_its(kvm, msi); + if (IS_ERR(its)) + return PTR_ERR(its); + + mutex_lock(&its->its_lock); + ret = vgic_its_trigger_msi(kvm, its, msi->devid, msi->data); + mutex_unlock(&its->its_lock); + + if (ret < 0) + return ret; + + /* + * KVM_SIGNAL_MSI demands a return value > 0 for success and 0 + * if the guest has blocked the MSI. So we map any LPI mapping + * related error to that. + */ + if (ret) + return 0; + else + return 1; +} + +/* Requires the its_lock to be held. */ +static void its_free_ite(struct kvm *kvm, struct its_ite *ite) +{ + list_del(&ite->ite_list); + + /* This put matches the get in vgic_add_lpi. */ + if (ite->irq) { + if (ite->irq->hw) + WARN_ON(its_unmap_vlpi(ite->irq->host_irq)); + + vgic_put_irq(kvm, ite->irq); + } + + kfree(ite); +} + +static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size) +{ + return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1); +} + +#define its_cmd_get_command(cmd) its_cmd_mask_field(cmd, 0, 0, 8) +#define its_cmd_get_deviceid(cmd) its_cmd_mask_field(cmd, 0, 32, 32) +#define its_cmd_get_size(cmd) (its_cmd_mask_field(cmd, 1, 0, 5) + 1) +#define its_cmd_get_id(cmd) its_cmd_mask_field(cmd, 1, 0, 32) +#define its_cmd_get_physical_id(cmd) its_cmd_mask_field(cmd, 1, 32, 32) +#define its_cmd_get_collection(cmd) its_cmd_mask_field(cmd, 2, 0, 16) +#define its_cmd_get_ittaddr(cmd) (its_cmd_mask_field(cmd, 2, 8, 44) << 8) +#define its_cmd_get_target_addr(cmd) its_cmd_mask_field(cmd, 2, 16, 32) +#define its_cmd_get_validbit(cmd) its_cmd_mask_field(cmd, 2, 63, 1) + +/* + * The DISCARD command frees an Interrupt Translation Table Entry (ITTE). + * Must be called with the its_lock mutex held. 
+ */ +static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + struct its_ite *ite; + + ite = find_ite(its, device_id, event_id); + if (ite && its_is_collection_mapped(ite->collection)) { + /* + * Though the spec talks about removing the pending state, we + * don't bother here since we clear the ITTE anyway and the + * pending state is a property of the ITTE struct. + */ + vgic_its_invalidate_cache(kvm); + + its_free_ite(kvm, ite); + return 0; + } + + return E_ITS_DISCARD_UNMAPPED_INTERRUPT; +} + +/* + * The MOVI command moves an ITTE to a different collection. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + u32 coll_id = its_cmd_get_collection(its_cmd); + struct kvm_vcpu *vcpu; + struct its_ite *ite; + struct its_collection *collection; + + ite = find_ite(its, device_id, event_id); + if (!ite) + return E_ITS_MOVI_UNMAPPED_INTERRUPT; + + if (!its_is_collection_mapped(ite->collection)) + return E_ITS_MOVI_UNMAPPED_COLLECTION; + + collection = find_collection(its, coll_id); + if (!its_is_collection_mapped(collection)) + return E_ITS_MOVI_UNMAPPED_COLLECTION; + + ite->collection = collection; + vcpu = kvm_get_vcpu(kvm, collection->target_addr); + + vgic_its_invalidate_cache(kvm); + + return update_affinity(ite->irq, vcpu); +} + +/* + * Check whether an ID can be stored into the corresponding guest table. + * For a direct table this is pretty easy, but gets a bit nasty for + * indirect tables. We check whether the resulting guest physical address + * is actually valid (covered by a memslot and guest accessible). + * For this we have to read the respective first level entry. 
+ */ +static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, + gpa_t *eaddr) +{ + int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; + u64 indirect_ptr, type = GITS_BASER_TYPE(baser); + phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser); + int esz = GITS_BASER_ENTRY_SIZE(baser); + int index, idx; + gfn_t gfn; + bool ret; + + switch (type) { + case GITS_BASER_TYPE_DEVICE: + if (id >= BIT_ULL(VITS_TYPER_DEVBITS)) + return false; + break; + case GITS_BASER_TYPE_COLLECTION: + /* as GITS_TYPER.CIL == 0, ITS supports 16-bit collection ID */ + if (id >= BIT_ULL(16)) + return false; + break; + default: + return false; + } + + if (!(baser & GITS_BASER_INDIRECT)) { + phys_addr_t addr; + + if (id >= (l1_tbl_size / esz)) + return false; + + addr = base + id * esz; + gfn = addr >> PAGE_SHIFT; + + if (eaddr) + *eaddr = addr; + + goto out; + } + + /* calculate and check the index into the 1st level */ + index = id / (SZ_64K / esz); + if (index >= (l1_tbl_size / sizeof(u64))) + return false; + + /* Each 1st level entry is represented by a 64-bit value. */ + if (kvm_read_guest_lock(its->dev->kvm, + base + index * sizeof(indirect_ptr), + &indirect_ptr, sizeof(indirect_ptr))) + return false; + + indirect_ptr = le64_to_cpu(indirect_ptr); + + /* check the valid bit of the first level entry */ + if (!(indirect_ptr & BIT_ULL(63))) + return false; + + /* Mask the guest physical address and calculate the frame number. 
*/ + indirect_ptr &= GENMASK_ULL(51, 16); + + /* Find the address of the actual entry */ + index = id % (SZ_64K / esz); + indirect_ptr += index * esz; + gfn = indirect_ptr >> PAGE_SHIFT; + + if (eaddr) + *eaddr = indirect_ptr; + +out: + idx = srcu_read_lock(&its->dev->kvm->srcu); + ret = kvm_is_visible_gfn(its->dev->kvm, gfn); + srcu_read_unlock(&its->dev->kvm->srcu, idx); + return ret; +} + +static int vgic_its_alloc_collection(struct vgic_its *its, + struct its_collection **colp, + u32 coll_id) +{ + struct its_collection *collection; + + if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL)) + return E_ITS_MAPC_COLLECTION_OOR; + + collection = kzalloc(sizeof(*collection), GFP_KERNEL); + if (!collection) + return -ENOMEM; + + collection->collection_id = coll_id; + collection->target_addr = COLLECTION_NOT_MAPPED; + + list_add_tail(&collection->coll_list, &its->collection_list); + *colp = collection; + + return 0; +} + +static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id) +{ + struct its_collection *collection; + struct its_device *device; + struct its_ite *ite; + + /* + * Clearing the mapping for that collection ID removes the + * entry from the list. If there wasn't any before, we can + * go home early. 
+ */ + collection = find_collection(its, coll_id); + if (!collection) + return; + + for_each_lpi_its(device, ite, its) + if (ite->collection && + ite->collection->collection_id == coll_id) + ite->collection = NULL; + + list_del(&collection->coll_list); + kfree(collection); +} + +/* Must be called with its_lock mutex held */ +static struct its_ite *vgic_its_alloc_ite(struct its_device *device, + struct its_collection *collection, + u32 event_id) +{ + struct its_ite *ite; + + ite = kzalloc(sizeof(*ite), GFP_KERNEL); + if (!ite) + return ERR_PTR(-ENOMEM); + + ite->event_id = event_id; + ite->collection = collection; + + list_add_tail(&ite->ite_list, &device->itt_head); + return ite; +} + +/* + * The MAPTI and MAPI commands map LPIs to ITTEs. + * Must be called with its_lock mutex held. + */ +static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + u32 coll_id = its_cmd_get_collection(its_cmd); + struct its_ite *ite; + struct kvm_vcpu *vcpu = NULL; + struct its_device *device; + struct its_collection *collection, *new_coll = NULL; + struct vgic_irq *irq; + int lpi_nr; + + device = find_its_device(its, device_id); + if (!device) + return E_ITS_MAPTI_UNMAPPED_DEVICE; + + if (event_id >= BIT_ULL(device->num_eventid_bits)) + return E_ITS_MAPTI_ID_OOR; + + if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI) + lpi_nr = its_cmd_get_physical_id(its_cmd); + else + lpi_nr = event_id; + if (lpi_nr < GIC_LPI_OFFSET || + lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser)) + return E_ITS_MAPTI_PHYSICALID_OOR; + + /* If there is an existing mapping, behavior is UNPREDICTABLE. 
*/ + if (find_ite(its, device_id, event_id)) + return 0; + + collection = find_collection(its, coll_id); + if (!collection) { + int ret = vgic_its_alloc_collection(its, &collection, coll_id); + if (ret) + return ret; + new_coll = collection; + } + + ite = vgic_its_alloc_ite(device, collection, event_id); + if (IS_ERR(ite)) { + if (new_coll) + vgic_its_free_collection(its, coll_id); + return PTR_ERR(ite); + } + + if (its_is_collection_mapped(collection)) + vcpu = kvm_get_vcpu(kvm, collection->target_addr); + + irq = vgic_add_lpi(kvm, lpi_nr, vcpu); + if (IS_ERR(irq)) { + if (new_coll) + vgic_its_free_collection(its, coll_id); + its_free_ite(kvm, ite); + return PTR_ERR(irq); + } + ite->irq = irq; + + return 0; +} + +/* Requires the its_lock to be held. */ +static void vgic_its_free_device(struct kvm *kvm, struct its_device *device) +{ + struct its_ite *ite, *temp; + + /* + * The spec says that unmapping a device with still valid + * ITTEs associated is UNPREDICTABLE. We remove all ITTEs, + * since we cannot leave the memory unreferenced. 
+ */ + list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list) + its_free_ite(kvm, ite); + + vgic_its_invalidate_cache(kvm); + + list_del(&device->dev_list); + kfree(device); +} + +/* its lock must be held */ +static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its) +{ + struct its_device *cur, *temp; + + list_for_each_entry_safe(cur, temp, &its->device_list, dev_list) + vgic_its_free_device(kvm, cur); +} + +/* its lock must be held */ +static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its) +{ + struct its_collection *cur, *temp; + + list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list) + vgic_its_free_collection(its, cur->collection_id); +} + +/* Must be called with its_lock mutex held */ +static struct its_device *vgic_its_alloc_device(struct vgic_its *its, + u32 device_id, gpa_t itt_addr, + u8 num_eventid_bits) +{ + struct its_device *device; + + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) + return ERR_PTR(-ENOMEM); + + device->device_id = device_id; + device->itt_addr = itt_addr; + device->num_eventid_bits = num_eventid_bits; + INIT_LIST_HEAD(&device->itt_head); + + list_add_tail(&device->dev_list, &its->device_list); + return device; +} + +/* + * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs). + * Must be called with the its_lock mutex held. 
+ */ +static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + bool valid = its_cmd_get_validbit(its_cmd); + u8 num_eventid_bits = its_cmd_get_size(its_cmd); + gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd); + struct its_device *device; + + if (!vgic_its_check_id(its, its->baser_device_table, device_id, NULL)) + return E_ITS_MAPD_DEVICE_OOR; + + if (valid && num_eventid_bits > VITS_TYPER_IDBITS) + return E_ITS_MAPD_ITTSIZE_OOR; + + device = find_its_device(its, device_id); + + /* + * The spec says that calling MAPD on an already mapped device + * invalidates all cached data for this device. We implement this + * by removing the mapping and re-establishing it. + */ + if (device) + vgic_its_free_device(kvm, device); + + /* + * The spec does not say whether unmapping a not-mapped device + * is an error, so we are done in any case. + */ + if (!valid) + return 0; + + device = vgic_its_alloc_device(its, device_id, itt_addr, + num_eventid_bits); + + return PTR_ERR_OR_ZERO(device); +} + +/* + * The MAPC command maps collection IDs to redistributors. + * Must be called with the its_lock mutex held. 
+ */ +static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u16 coll_id; + u32 target_addr; + struct its_collection *collection; + bool valid; + + valid = its_cmd_get_validbit(its_cmd); + coll_id = its_cmd_get_collection(its_cmd); + target_addr = its_cmd_get_target_addr(its_cmd); + + if (target_addr >= atomic_read(&kvm->online_vcpus)) + return E_ITS_MAPC_PROCNUM_OOR; + + if (!valid) { + vgic_its_free_collection(its, coll_id); + vgic_its_invalidate_cache(kvm); + } else { + collection = find_collection(its, coll_id); + + if (!collection) { + int ret; + + ret = vgic_its_alloc_collection(its, &collection, + coll_id); + if (ret) + return ret; + collection->target_addr = target_addr; + } else { + collection->target_addr = target_addr; + update_affinity_collection(kvm, its, collection); + } + } + + return 0; +} + +/* + * The CLEAR command removes the pending state for a particular LPI. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + struct its_ite *ite; + + + ite = find_ite(its, device_id, event_id); + if (!ite) + return E_ITS_CLEAR_UNMAPPED_INTERRUPT; + + ite->irq->pending_latch = false; + + if (ite->irq->hw) + return irq_set_irqchip_state(ite->irq->host_irq, + IRQCHIP_STATE_PENDING, false); + + return 0; +} + +/* + * The INV command syncs the configuration bits from the memory table. + * Must be called with the its_lock mutex held. 
+ */ +static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + struct its_ite *ite; + + + ite = find_ite(its, device_id, event_id); + if (!ite) + return E_ITS_INV_UNMAPPED_INTERRUPT; + + return update_lpi_config(kvm, ite->irq, NULL, true); +} + +/* + * The INVALL command requests flushing of all IRQ data in this collection. + * Find the VCPU mapped to that collection, then iterate over the VM's list + * of mapped LPIs and update the configuration for each IRQ which targets + * the specified vcpu. The configuration will be read from the in-memory + * configuration table. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 coll_id = its_cmd_get_collection(its_cmd); + struct its_collection *collection; + struct kvm_vcpu *vcpu; + struct vgic_irq *irq; + u32 *intids; + int irq_count, i; + + collection = find_collection(its, coll_id); + if (!its_is_collection_mapped(collection)) + return E_ITS_INVALL_UNMAPPED_COLLECTION; + + vcpu = kvm_get_vcpu(kvm, collection->target_addr); + + irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids); + if (irq_count < 0) + return irq_count; + + for (i = 0; i < irq_count; i++) { + irq = vgic_get_irq(kvm, NULL, intids[i]); + if (!irq) + continue; + update_lpi_config(kvm, irq, vcpu, false); + vgic_put_irq(kvm, irq); + } + + kfree(intids); + + if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm) + its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe); + + return 0; +} + +/* + * The MOVALL command moves the pending state of all IRQs targeting one + * redistributor to another. We don't hold the pending state in the VCPUs, + * but in the IRQs instead, so there is really not much to do for us here. 
+ * However the spec says that no IRQ must target the old redistributor + * afterwards, so we make sure that no LPI is using the associated target_vcpu. + * This command affects all LPIs in the system that target that redistributor. + */ +static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 target1_addr = its_cmd_get_target_addr(its_cmd); + u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32); + struct kvm_vcpu *vcpu1, *vcpu2; + struct vgic_irq *irq; + u32 *intids; + int irq_count, i; + + if (target1_addr >= atomic_read(&kvm->online_vcpus) || + target2_addr >= atomic_read(&kvm->online_vcpus)) + return E_ITS_MOVALL_PROCNUM_OOR; + + if (target1_addr == target2_addr) + return 0; + + vcpu1 = kvm_get_vcpu(kvm, target1_addr); + vcpu2 = kvm_get_vcpu(kvm, target2_addr); + + irq_count = vgic_copy_lpi_list(kvm, vcpu1, &intids); + if (irq_count < 0) + return irq_count; + + for (i = 0; i < irq_count; i++) { + irq = vgic_get_irq(kvm, NULL, intids[i]); + + update_affinity(irq, vcpu2); + + vgic_put_irq(kvm, irq); + } + + vgic_its_invalidate_cache(kvm); + + kfree(intids); + return 0; +} + +/* + * The INT command injects the LPI associated with that DevID/EvID pair. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 msi_data = its_cmd_get_id(its_cmd); + u64 msi_devid = its_cmd_get_deviceid(its_cmd); + + return vgic_its_trigger_msi(kvm, its, msi_devid, msi_data); +} + +/* + * This function is called with the its_cmd lock held, but the ITS data + * structure lock dropped. 
+ */ +static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + int ret = -ENODEV; + + mutex_lock(&its->its_lock); + switch (its_cmd_get_command(its_cmd)) { + case GITS_CMD_MAPD: + ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd); + break; + case GITS_CMD_MAPC: + ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd); + break; + case GITS_CMD_MAPI: + ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); + break; + case GITS_CMD_MAPTI: + ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); + break; + case GITS_CMD_MOVI: + ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd); + break; + case GITS_CMD_DISCARD: + ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd); + break; + case GITS_CMD_CLEAR: + ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd); + break; + case GITS_CMD_MOVALL: + ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd); + break; + case GITS_CMD_INT: + ret = vgic_its_cmd_handle_int(kvm, its, its_cmd); + break; + case GITS_CMD_INV: + ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd); + break; + case GITS_CMD_INVALL: + ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd); + break; + case GITS_CMD_SYNC: + /* we ignore this command: we are in sync all of the time */ + ret = 0; + break; + } + mutex_unlock(&its->its_lock); + + return ret; +} + +static u64 vgic_sanitise_its_baser(u64 reg) +{ + reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK, + GITS_BASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK, + GITS_BASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK, + GITS_BASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + /* We support only one (ITS) page size: 64K */ + reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K; + + return reg; +} + +static u64 vgic_sanitise_its_cbaser(u64 reg) +{ + reg = vgic_sanitise_field(reg, 
GITS_CBASER_SHAREABILITY_MASK, + GITS_CBASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK, + GITS_CBASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK, + GITS_CBASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + /* Sanitise the physical address to be 64k aligned. */ + reg &= ~GENMASK_ULL(15, 12); + + return reg; +} + +static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return extract_bytes(its->cbaser, addr & 7, len); +} + +static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + /* When GITS_CTLR.Enable is 1, this register is RO. */ + if (its->enabled) + return; + + mutex_lock(&its->cmd_lock); + its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val); + its->cbaser = vgic_sanitise_its_cbaser(its->cbaser); + its->creadr = 0; + /* + * CWRITER is architecturally UNKNOWN on reset, but we need to reset + * it to CREADR to make sure we start with an empty command buffer. + */ + its->cwriter = its->creadr; + mutex_unlock(&its->cmd_lock); +} + +#define ITS_CMD_BUFFER_SIZE(baser) ((((baser) & 0xff) + 1) << 12) +#define ITS_CMD_SIZE 32 +#define ITS_CMD_OFFSET(reg) ((reg) & GENMASK(19, 5)) + +/* Must be called with the cmd_lock held. */ +static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its) +{ + gpa_t cbaser; + u64 cmd_buf[4]; + + /* Commands are only processed when the ITS is enabled. 
*/ + if (!its->enabled) + return; + + cbaser = GITS_CBASER_ADDRESS(its->cbaser); + + while (its->cwriter != its->creadr) { + int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr, + cmd_buf, ITS_CMD_SIZE); + /* + * If kvm_read_guest() fails, this could be due to the guest + * programming a bogus value in CBASER or something else going + * wrong from which we cannot easily recover. + * According to section 6.3.2 in the GICv3 spec we can just + * ignore that command then. + */ + if (!ret) + vgic_its_handle_command(kvm, its, cmd_buf); + + its->creadr += ITS_CMD_SIZE; + if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser)) + its->creadr = 0; + } +} + +/* + * By writing to CWRITER the guest announces new commands to be processed. + * To avoid any races in the first place, we take the its_cmd lock, which + * protects our ring buffer variables, so that there is only one user + * per ITS handling commands at a given time. + */ +static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u64 reg; + + if (!its) + return; + + mutex_lock(&its->cmd_lock); + + reg = update_64bit_reg(its->cwriter, addr & 7, len, val); + reg = ITS_CMD_OFFSET(reg); + if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) { + mutex_unlock(&its->cmd_lock); + return; + } + its->cwriter = reg; + + vgic_its_process_commands(kvm, its); + + mutex_unlock(&its->cmd_lock); +} + +static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return extract_bytes(its->cwriter, addr & 0x7, len); +} + +static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return extract_bytes(its->creadr, addr & 0x7, len); +} + +static int vgic_mmio_uaccess_write_its_creadr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 cmd_offset; + int ret = 0; + + 
mutex_lock(&its->cmd_lock); + + if (its->enabled) { + ret = -EBUSY; + goto out; + } + + cmd_offset = ITS_CMD_OFFSET(val); + if (cmd_offset >= ITS_CMD_BUFFER_SIZE(its->cbaser)) { + ret = -EINVAL; + goto out; + } + + its->creadr = cmd_offset; +out: + mutex_unlock(&its->cmd_lock); + return ret; +} + +#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7) +static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + u64 reg; + + switch (BASER_INDEX(addr)) { + case 0: + reg = its->baser_device_table; + break; + case 1: + reg = its->baser_coll_table; + break; + default: + reg = 0; + break; + } + + return extract_bytes(reg, addr & 7, len); +} + +#define GITS_BASER_RO_MASK (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56)) +static void vgic_mmio_write_its_baser(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 entry_size, table_type; + u64 reg, *regptr, clearbits = 0; + + /* When GITS_CTLR.Enable is 1, we ignore write accesses. 
*/ + if (its->enabled) + return; + + switch (BASER_INDEX(addr)) { + case 0: + regptr = &its->baser_device_table; + entry_size = abi->dte_esz; + table_type = GITS_BASER_TYPE_DEVICE; + break; + case 1: + regptr = &its->baser_coll_table; + entry_size = abi->cte_esz; + table_type = GITS_BASER_TYPE_COLLECTION; + clearbits = GITS_BASER_INDIRECT; + break; + default: + return; + } + + reg = update_64bit_reg(*regptr, addr & 7, len, val); + reg &= ~GITS_BASER_RO_MASK; + reg &= ~clearbits; + + reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT; + reg |= table_type << GITS_BASER_TYPE_SHIFT; + reg = vgic_sanitise_its_baser(reg); + + *regptr = reg; + + if (!(reg & GITS_BASER_VALID)) { + /* Take the its_lock to prevent a race with a save/restore */ + mutex_lock(&its->its_lock); + switch (table_type) { + case GITS_BASER_TYPE_DEVICE: + vgic_its_free_device_list(kvm, its); + break; + case GITS_BASER_TYPE_COLLECTION: + vgic_its_free_collection_list(kvm, its); + break; + } + mutex_unlock(&its->its_lock); + } +} + +static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + u32 reg = 0; + + mutex_lock(&its->cmd_lock); + if (its->creadr == its->cwriter) + reg |= GITS_CTLR_QUIESCENT; + if (its->enabled) + reg |= GITS_CTLR_ENABLE; + mutex_unlock(&its->cmd_lock); + + return reg; +} + +static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + mutex_lock(&its->cmd_lock); + + /* + * It is UNPREDICTABLE to enable the ITS if any of the CBASER or + * device/collection BASER are invalid + */ + if (!its->enabled && (val & GITS_CTLR_ENABLE) && + (!(its->baser_device_table & GITS_BASER_VALID) || + !(its->baser_coll_table & GITS_BASER_VALID) || + !(its->cbaser & GITS_CBASER_VALID))) + goto out; + + its->enabled = !!(val & GITS_CTLR_ENABLE); + if (!its->enabled) + vgic_its_invalidate_cache(kvm); + + /* + * Try to process any pending commands. 
This function bails out early + * if the ITS is disabled or no commands have been queued. + */ + vgic_its_process_commands(kvm, its); + +out: + mutex_unlock(&its->cmd_lock); +} + +#define REGISTER_ITS_DESC(off, rd, wr, length, acc) \ +{ \ + .reg_offset = off, \ + .len = length, \ + .access_flags = acc, \ + .its_read = rd, \ + .its_write = wr, \ +} + +#define REGISTER_ITS_DESC_UACCESS(off, rd, wr, uwr, length, acc)\ +{ \ + .reg_offset = off, \ + .len = length, \ + .access_flags = acc, \ + .its_read = rd, \ + .its_write = wr, \ + .uaccess_its_write = uwr, \ +} + +static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, unsigned long val) +{ + /* Ignore */ +} + +static struct vgic_register_region its_registers[] = { + REGISTER_ITS_DESC(GITS_CTLR, + vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4, + VGIC_ACCESS_32bit), + REGISTER_ITS_DESC_UACCESS(GITS_IIDR, + vgic_mmio_read_its_iidr, its_mmio_write_wi, + vgic_mmio_uaccess_write_its_iidr, 4, + VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_TYPER, + vgic_mmio_read_its_typer, its_mmio_write_wi, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_CBASER, + vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_CWRITER, + vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC_UACCESS(GITS_CREADR, + vgic_mmio_read_its_creadr, its_mmio_write_wi, + vgic_mmio_uaccess_write_its_creadr, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_BASER, + vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_IDREGS_BASE, + vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30, + VGIC_ACCESS_32bit), +}; + +/* This is called on setting the LPI enable bit in the redistributor. 
*/ +void vgic_enable_lpis(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ)) + its_sync_lpi_pending_table(vcpu); +} + +static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its, + u64 addr) +{ + struct vgic_io_device *iodev = &its->iodev; + int ret; + + mutex_lock(&kvm->slots_lock); + if (!IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) { + ret = -EBUSY; + goto out; + } + + its->vgic_its_base = addr; + iodev->regions = its_registers; + iodev->nr_regions = ARRAY_SIZE(its_registers); + kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops); + + iodev->base_addr = its->vgic_its_base; + iodev->iodev_type = IODEV_ITS; + iodev->its = its; + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr, + KVM_VGIC_V3_ITS_SIZE, &iodev->dev); +out: + mutex_unlock(&kvm->slots_lock); + + return ret; +} + +/* Default is 16 cached LPIs per vcpu */ +#define LPI_DEFAULT_PCPU_CACHE_SIZE 16 + +void vgic_lpi_translation_cache_init(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + unsigned int sz; + int i; + + if (!list_empty(&dist->lpi_translation_cache)) + return; + + sz = atomic_read(&kvm->online_vcpus) * LPI_DEFAULT_PCPU_CACHE_SIZE; + + for (i = 0; i < sz; i++) { + struct vgic_translation_cache_entry *cte; + + /* An allocation failure is not fatal */ + cte = kzalloc(sizeof(*cte), GFP_KERNEL); + if (WARN_ON(!cte)) + break; + + INIT_LIST_HEAD(&cte->entry); + list_add(&cte->entry, &dist->lpi_translation_cache); + } +} + +void vgic_lpi_translation_cache_destroy(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_translation_cache_entry *cte, *tmp; + + vgic_its_invalidate_cache(kvm); + + list_for_each_entry_safe(cte, tmp, + &dist->lpi_translation_cache, entry) { + list_del(&cte->entry); + kfree(cte); + } +} + +#define INITIAL_BASER_VALUE \ + (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \ + GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \ + GIC_BASER_SHAREABILITY(GITS_BASER, 
InnerShareable) | \ + GITS_BASER_PAGE_SIZE_64K) + +#define INITIAL_PROPBASER_VALUE \ + (GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) | \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner) | \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable)) + +static int vgic_its_create(struct kvm_device *dev, u32 type) +{ + struct vgic_its *its; + + if (type != KVM_DEV_TYPE_ARM_VGIC_ITS) + return -ENODEV; + + its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL); + if (!its) + return -ENOMEM; + + if (vgic_initialized(dev->kvm)) { + int ret = vgic_v4_init(dev->kvm); + if (ret < 0) { + kfree(its); + return ret; + } + + vgic_lpi_translation_cache_init(dev->kvm); + } + + mutex_init(&its->its_lock); + mutex_init(&its->cmd_lock); + + its->vgic_its_base = VGIC_ADDR_UNDEF; + + INIT_LIST_HEAD(&its->device_list); + INIT_LIST_HEAD(&its->collection_list); + + dev->kvm->arch.vgic.msis_require_devid = true; + dev->kvm->arch.vgic.has_its = true; + its->enabled = false; + its->dev = dev; + + its->baser_device_table = INITIAL_BASER_VALUE | + ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT); + its->baser_coll_table = INITIAL_BASER_VALUE | + ((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT); + dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE; + + dev->private = its; + + return vgic_its_set_abi(its, NR_ITS_ABIS - 1); +} + +static void vgic_its_destroy(struct kvm_device *kvm_dev) +{ + struct kvm *kvm = kvm_dev->kvm; + struct vgic_its *its = kvm_dev->private; + + mutex_lock(&its->its_lock); + + vgic_its_free_device_list(kvm, its); + vgic_its_free_collection_list(kvm, its); + + mutex_unlock(&its->its_lock); + kfree(its); + kfree(kvm_dev);/* alloc by kvm_ioctl_create_device, free by .destroy */ +} + +static int vgic_its_has_attr_regs(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + const struct vgic_register_region *region; + gpa_t offset = attr->attr; + int align; + + align = (offset < GITS_TYPER) || (offset >= GITS_PIDR4) ? 
0x3 : 0x7; + + if (offset & align) + return -EINVAL; + + region = vgic_find_mmio_region(its_registers, + ARRAY_SIZE(its_registers), + offset); + if (!region) + return -ENXIO; + + return 0; +} + +static int vgic_its_attr_regs_access(struct kvm_device *dev, + struct kvm_device_attr *attr, + u64 *reg, bool is_write) +{ + const struct vgic_register_region *region; + struct vgic_its *its; + gpa_t addr, offset; + unsigned int len; + int align, ret = 0; + + its = dev->private; + offset = attr->attr; + + /* + * Although the spec supports upper/lower 32-bit accesses to + * 64-bit ITS registers, the userspace ABI requires 64-bit + * accesses to all 64-bit wide registers. We therefore only + * support 32-bit accesses to GITS_CTLR, GITS_IIDR and GITS ID + * registers + */ + if ((offset < GITS_TYPER) || (offset >= GITS_PIDR4)) + align = 0x3; + else + align = 0x7; + + if (offset & align) + return -EINVAL; + + mutex_lock(&dev->kvm->lock); + + if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) { + ret = -ENXIO; + goto out; + } + + region = vgic_find_mmio_region(its_registers, + ARRAY_SIZE(its_registers), + offset); + if (!region) { + ret = -ENXIO; + goto out; + } + + if (!lock_all_vcpus(dev->kvm)) { + ret = -EBUSY; + goto out; + } + + addr = its->vgic_its_base + offset; + + len = region->access_flags & VGIC_ACCESS_64bit ? 
8 : 4; + + if (is_write) { + if (region->uaccess_its_write) + ret = region->uaccess_its_write(dev->kvm, its, addr, + len, *reg); + else + region->its_write(dev->kvm, its, addr, len, *reg); + } else { + *reg = region->its_read(dev->kvm, its, addr, len); + } + unlock_all_vcpus(dev->kvm); +out: + mutex_unlock(&dev->kvm->lock); + return ret; +} + +static u32 compute_next_devid_offset(struct list_head *h, + struct its_device *dev) +{ + struct its_device *next; + u32 next_offset; + + if (list_is_last(&dev->dev_list, h)) + return 0; + next = list_next_entry(dev, dev_list); + next_offset = next->device_id - dev->device_id; + + return min_t(u32, next_offset, VITS_DTE_MAX_DEVID_OFFSET); +} + +static u32 compute_next_eventid_offset(struct list_head *h, struct its_ite *ite) +{ + struct its_ite *next; + u32 next_offset; + + if (list_is_last(&ite->ite_list, h)) + return 0; + next = list_next_entry(ite, ite_list); + next_offset = next->event_id - ite->event_id; + + return min_t(u32, next_offset, VITS_ITE_MAX_EVENTID_OFFSET); +} + +/** + * entry_fn_t - Callback called on a table entry restore path + * @its: its handle + * @id: id of the entry + * @entry: pointer to the entry + * @opaque: pointer to an opaque data + * + * Return: < 0 on error, 0 if last element was identified, id offset to next + * element otherwise + */ +typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry, + void *opaque); + +/** + * scan_its_table - Scan a contiguous table in guest RAM and applies a function + * to each entry + * + * @its: its handle + * @base: base gpa of the table + * @size: size of the table in bytes + * @esz: entry size in bytes + * @start_id: the ID of the first entry in the table + * (non zero for 2d level tables) + * @fn: function to apply on each entry + * + * Return: < 0 on error, 0 if last element was identified, 1 otherwise + * (the last element may not be found on second level tables) + */ +static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, 
+ int start_id, entry_fn_t fn, void *opaque) +{ + struct kvm *kvm = its->dev->kvm; + unsigned long len = size; + int id = start_id; + gpa_t gpa = base; + char entry[ESZ_MAX]; + int ret; + + memset(entry, 0, esz); + + while (len > 0) { + int next_offset; + size_t byte_offset; + + ret = kvm_read_guest_lock(kvm, gpa, entry, esz); + if (ret) + return ret; + + next_offset = fn(its, id, entry, opaque); + if (next_offset <= 0) + return next_offset; + + byte_offset = next_offset * esz; + id += next_offset; + gpa += byte_offset; + len -= byte_offset; + } + return 1; +} + +/** + * vgic_its_save_ite - Save an interrupt translation entry at @gpa + */ +static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, + struct its_ite *ite, gpa_t gpa, int ite_esz) +{ + struct kvm *kvm = its->dev->kvm; + u32 next_offset; + u64 val; + + next_offset = compute_next_eventid_offset(&dev->itt_head, ite); + val = ((u64)next_offset << KVM_ITS_ITE_NEXT_SHIFT) | + ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | + ite->collection->collection_id; + val = cpu_to_le64(val); + return kvm_write_guest_lock(kvm, gpa, &val, ite_esz); +} + +/** + * vgic_its_restore_ite - restore an interrupt translation entry + * @event_id: id used for indexing + * @ptr: pointer to the ITE entry + * @opaque: pointer to the its_device + */ +static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id, + void *ptr, void *opaque) +{ + struct its_device *dev = (struct its_device *)opaque; + struct its_collection *collection; + struct kvm *kvm = its->dev->kvm; + struct kvm_vcpu *vcpu = NULL; + u64 val; + u64 *p = (u64 *)ptr; + struct vgic_irq *irq; + u32 coll_id, lpi_id; + struct its_ite *ite; + u32 offset; + + val = *p; + + val = le64_to_cpu(val); + + coll_id = val & KVM_ITS_ITE_ICID_MASK; + lpi_id = (val & KVM_ITS_ITE_PINTID_MASK) >> KVM_ITS_ITE_PINTID_SHIFT; + + if (!lpi_id) + return 1; /* invalid entry, no choice but to scan next entry */ + + if (lpi_id < VGIC_MIN_LPI) + return -EINVAL; + + offset 
= val >> KVM_ITS_ITE_NEXT_SHIFT; + if (event_id + offset >= BIT_ULL(dev->num_eventid_bits)) + return -EINVAL; + + collection = find_collection(its, coll_id); + if (!collection) + return -EINVAL; + + ite = vgic_its_alloc_ite(dev, collection, event_id); + if (IS_ERR(ite)) + return PTR_ERR(ite); + + if (its_is_collection_mapped(collection)) + vcpu = kvm_get_vcpu(kvm, collection->target_addr); + + irq = vgic_add_lpi(kvm, lpi_id, vcpu); + if (IS_ERR(irq)) + return PTR_ERR(irq); + ite->irq = irq; + + return offset; +} + +static int vgic_its_ite_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + struct its_ite *itea = container_of(a, struct its_ite, ite_list); + struct its_ite *iteb = container_of(b, struct its_ite, ite_list); + + if (itea->event_id < iteb->event_id) + return -1; + else + return 1; +} + +static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + gpa_t base = device->itt_addr; + struct its_ite *ite; + int ret; + int ite_esz = abi->ite_esz; + + list_sort(NULL, &device->itt_head, vgic_its_ite_cmp); + + list_for_each_entry(ite, &device->itt_head, ite_list) { + gpa_t gpa = base + ite->event_id * ite_esz; + + /* + * If an LPI carries the HW bit, this means that this + * interrupt is controlled by GICv4, and we do not + * have direct access to that state. Let's simply fail + * the save operation... 
+ */ + if (ite->irq->hw) + return -EACCES; + + ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz); + if (ret) + return ret; + } + return 0; +} + +/** + * vgic_its_restore_itt - restore the ITT of a device + * + * @its: its handle + * @dev: device handle + * + * Return 0 on success, < 0 on error + */ +static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + gpa_t base = dev->itt_addr; + int ret; + int ite_esz = abi->ite_esz; + size_t max_size = BIT_ULL(dev->num_eventid_bits) * ite_esz; + + ret = scan_its_table(its, base, max_size, ite_esz, 0, + vgic_its_restore_ite, dev); + + /* scan_its_table returns +1 if all ITEs are invalid */ + if (ret > 0) + ret = 0; + + return ret; +} + +/** + * vgic_its_save_dte - Save a device table entry at a given GPA + * + * @its: ITS handle + * @dev: ITS device + * @ptr: GPA + */ +static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, + gpa_t ptr, int dte_esz) +{ + struct kvm *kvm = its->dev->kvm; + u64 val, itt_addr_field; + u32 next_offset; + + itt_addr_field = dev->itt_addr >> 8; + next_offset = compute_next_devid_offset(&its->device_list, dev); + val = (1ULL << KVM_ITS_DTE_VALID_SHIFT | + ((u64)next_offset << KVM_ITS_DTE_NEXT_SHIFT) | + (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | + (dev->num_eventid_bits - 1)); + val = cpu_to_le64(val); + return kvm_write_guest_lock(kvm, ptr, &val, dte_esz); +} + +/** + * vgic_its_restore_dte - restore a device table entry + * + * @its: its handle + * @id: device id the DTE corresponds to + * @ptr: kernel VA where the 8 byte DTE is located + * @opaque: unused + * + * Return: < 0 on error, 0 if the dte is the last one, id offset to the + * next dte otherwise + */ +static int vgic_its_restore_dte(struct vgic_its *its, u32 id, + void *ptr, void *opaque) +{ + struct its_device *dev; + gpa_t itt_addr; + u8 num_eventid_bits; + u64 entry = *(u64 *)ptr; + bool valid; + u32 offset; + int ret; + + 
entry = le64_to_cpu(entry); + + valid = entry >> KVM_ITS_DTE_VALID_SHIFT; + num_eventid_bits = (entry & KVM_ITS_DTE_SIZE_MASK) + 1; + itt_addr = ((entry & KVM_ITS_DTE_ITTADDR_MASK) + >> KVM_ITS_DTE_ITTADDR_SHIFT) << 8; + + if (!valid) + return 1; + + /* dte entry is valid */ + offset = (entry & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT; + + dev = vgic_its_alloc_device(its, id, itt_addr, num_eventid_bits); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + ret = vgic_its_restore_itt(its, dev); + if (ret) { + vgic_its_free_device(its->dev->kvm, dev); + return ret; + } + + return offset; +} + +static int vgic_its_device_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + struct its_device *deva = container_of(a, struct its_device, dev_list); + struct its_device *devb = container_of(b, struct its_device, dev_list); + + if (deva->device_id < devb->device_id) + return -1; + else + return 1; +} + +/** + * vgic_its_save_device_tables - Save the device table and all ITT + * into guest RAM + * + * L1/L2 handling is hidden by vgic_its_check_id() helper which directly + * returns the GPA of the device entry + */ +static int vgic_its_save_device_tables(struct vgic_its *its) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 baser = its->baser_device_table; + struct its_device *dev; + int dte_esz = abi->dte_esz; + + if (!(baser & GITS_BASER_VALID)) + return 0; + + list_sort(NULL, &its->device_list, vgic_its_device_cmp); + + list_for_each_entry(dev, &its->device_list, dev_list) { + int ret; + gpa_t eaddr; + + if (!vgic_its_check_id(its, baser, + dev->device_id, &eaddr)) + return -EINVAL; + + ret = vgic_its_save_itt(its, dev); + if (ret) + return ret; + + ret = vgic_its_save_dte(its, dev, eaddr, dte_esz); + if (ret) + return ret; + } + return 0; +} + +/** + * handle_l1_dte - callback used for L1 device table entries (2 stage case) + * + * @its: its handle + * @id: index of the entry in the L1 table + * @addr: kernel VA + * @opaque: unused + * + * L1 
table entries are scanned by steps of 1 entry + * Return < 0 if error, 0 if last dte was found when scanning the L2 + * table, +1 otherwise (meaning next L1 entry must be scanned) + */ +static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr, + void *opaque) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + int l2_start_id = id * (SZ_64K / abi->dte_esz); + u64 entry = *(u64 *)addr; + int dte_esz = abi->dte_esz; + gpa_t gpa; + int ret; + + entry = le64_to_cpu(entry); + + if (!(entry & KVM_ITS_L1E_VALID_MASK)) + return 1; + + gpa = entry & KVM_ITS_L1E_ADDR_MASK; + + ret = scan_its_table(its, gpa, SZ_64K, dte_esz, + l2_start_id, vgic_its_restore_dte, NULL); + + return ret; +} + +/** + * vgic_its_restore_device_tables - Restore the device table and all ITT + * from guest RAM to internal data structs + */ +static int vgic_its_restore_device_tables(struct vgic_its *its) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 baser = its->baser_device_table; + int l1_esz, ret; + int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; + gpa_t l1_gpa; + + if (!(baser & GITS_BASER_VALID)) + return 0; + + l1_gpa = GITS_BASER_ADDR_48_to_52(baser); + + if (baser & GITS_BASER_INDIRECT) { + l1_esz = GITS_LVL1_ENTRY_SIZE; + ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0, + handle_l1_dte, NULL); + } else { + l1_esz = abi->dte_esz; + ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0, + vgic_its_restore_dte, NULL); + } + + /* scan_its_table returns +1 if all entries are invalid */ + if (ret > 0) + ret = 0; + + return ret; +} + +static int vgic_its_save_cte(struct vgic_its *its, + struct its_collection *collection, + gpa_t gpa, int esz) +{ + u64 val; + + val = (1ULL << KVM_ITS_CTE_VALID_SHIFT | + ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) | + collection->collection_id); + val = cpu_to_le64(val); + return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz); +} + +static int vgic_its_restore_cte(struct vgic_its *its, 
gpa_t gpa, int esz) +{ + struct its_collection *collection; + struct kvm *kvm = its->dev->kvm; + u32 target_addr, coll_id; + u64 val; + int ret; + + BUG_ON(esz > sizeof(val)); + ret = kvm_read_guest_lock(kvm, gpa, &val, esz); + if (ret) + return ret; + val = le64_to_cpu(val); + if (!(val & KVM_ITS_CTE_VALID_MASK)) + return 0; + + target_addr = (u32)(val >> KVM_ITS_CTE_RDBASE_SHIFT); + coll_id = val & KVM_ITS_CTE_ICID_MASK; + + if (target_addr != COLLECTION_NOT_MAPPED && + target_addr >= atomic_read(&kvm->online_vcpus)) + return -EINVAL; + + collection = find_collection(its, coll_id); + if (collection) + return -EEXIST; + ret = vgic_its_alloc_collection(its, &collection, coll_id); + if (ret) + return ret; + collection->target_addr = target_addr; + return 1; +} + +/** + * vgic_its_save_collection_table - Save the collection table into + * guest RAM + */ +static int vgic_its_save_collection_table(struct vgic_its *its) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 baser = its->baser_coll_table; + gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser); + struct its_collection *collection; + u64 val; + size_t max_size, filled = 0; + int ret, cte_esz = abi->cte_esz; + + if (!(baser & GITS_BASER_VALID)) + return 0; + + max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; + + list_for_each_entry(collection, &its->collection_list, coll_list) { + ret = vgic_its_save_cte(its, collection, gpa, cte_esz); + if (ret) + return ret; + gpa += cte_esz; + filled += cte_esz; + } + + if (filled == max_size) + return 0; + + /* + * table is not fully filled, add a last dummy element + * with valid bit unset + */ + val = 0; + BUG_ON(cte_esz > sizeof(val)); + ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); + return ret; +} + +/** + * vgic_its_restore_collection_table - reads the collection table + * in guest memory and restores the ITS internal state. Requires the + * BASER registers to be restored before. 
+ */ +static int vgic_its_restore_collection_table(struct vgic_its *its) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 baser = its->baser_coll_table; + int cte_esz = abi->cte_esz; + size_t max_size, read = 0; + gpa_t gpa; + int ret; + + if (!(baser & GITS_BASER_VALID)) + return 0; + + gpa = GITS_BASER_ADDR_48_to_52(baser); + + max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; + + while (read < max_size) { + ret = vgic_its_restore_cte(its, gpa, cte_esz); + if (ret <= 0) + break; + gpa += cte_esz; + read += cte_esz; + } + + if (ret > 0) + return 0; + + return ret; +} + +/** + * vgic_its_save_tables_v0 - Save the ITS tables into guest RAM + * according to v0 ABI + */ +static int vgic_its_save_tables_v0(struct vgic_its *its) +{ + int ret; + + ret = vgic_its_save_device_tables(its); + if (ret) + return ret; + + return vgic_its_save_collection_table(its); +} + +/** + * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM + * to internal data structs according to V0 ABI + * + */ +static int vgic_its_restore_tables_v0(struct vgic_its *its) +{ + int ret; + + ret = vgic_its_restore_collection_table(its); + if (ret) + return ret; + + return vgic_its_restore_device_tables(its); +} + +static int vgic_its_commit_v0(struct vgic_its *its) +{ + const struct vgic_its_abi *abi; + + abi = vgic_its_get_abi(its); + its->baser_coll_table &= ~GITS_BASER_ENTRY_SIZE_MASK; + its->baser_device_table &= ~GITS_BASER_ENTRY_SIZE_MASK; + + its->baser_coll_table |= (GIC_ENCODE_SZ(abi->cte_esz, 5) + << GITS_BASER_ENTRY_SIZE_SHIFT); + + its->baser_device_table |= (GIC_ENCODE_SZ(abi->dte_esz, 5) + << GITS_BASER_ENTRY_SIZE_SHIFT); + return 0; +} + +static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its) +{ + /* We need to keep the ABI specific field values */ + its->baser_coll_table &= ~GITS_BASER_VALID; + its->baser_device_table &= ~GITS_BASER_VALID; + its->cbaser = 0; + its->creadr = 0; + its->cwriter = 0; + its->enabled = 0; + vgic_its_free_device_list(kvm, 
its); + vgic_its_free_collection_list(kvm, its); +} + +static int vgic_its_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + switch (attr->attr) { + case KVM_VGIC_ITS_ADDR_TYPE: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return 0; + case KVM_DEV_ARM_ITS_CTRL_RESET: + return 0; + case KVM_DEV_ARM_ITS_SAVE_TABLES: + return 0; + case KVM_DEV_ARM_ITS_RESTORE_TABLES: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: + return vgic_its_has_attr_regs(dev, attr); + } + return -ENXIO; +} + +static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + int ret = 0; + + if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */ + return 0; + + mutex_lock(&kvm->lock); + mutex_lock(&its->its_lock); + + if (!lock_all_vcpus(kvm)) { + mutex_unlock(&its->its_lock); + mutex_unlock(&kvm->lock); + return -EBUSY; + } + + switch (attr) { + case KVM_DEV_ARM_ITS_CTRL_RESET: + vgic_its_reset(kvm, its); + break; + case KVM_DEV_ARM_ITS_SAVE_TABLES: + ret = abi->save_tables(its); + break; + case KVM_DEV_ARM_ITS_RESTORE_TABLES: + ret = abi->restore_tables(its); + break; + } + + unlock_all_vcpus(kvm); + mutex_unlock(&its->its_lock); + mutex_unlock(&kvm->lock); + return ret; +} + +static int vgic_its_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct vgic_its *its = dev->private; + int ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + unsigned long type = (unsigned long)attr->attr; + u64 addr; + + if (type != KVM_VGIC_ITS_ADDR_TYPE) + return -ENODEV; + + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base, + addr, SZ_64K); + if (ret) + return ret; + + return 
vgic_register_its_iodev(dev->kvm, its, addr); + } + case KVM_DEV_ARM_VGIC_GRP_CTRL: + return vgic_its_ctrl(dev->kvm, its, attr->attr); + case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 reg; + + if (get_user(reg, uaddr)) + return -EFAULT; + + return vgic_its_attr_regs_access(dev, attr, &reg, true); + } + } + return -ENXIO; +} + +static int vgic_its_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + struct vgic_its *its = dev->private; + u64 addr = its->vgic_its_base; + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + unsigned long type = (unsigned long)attr->attr; + + if (type != KVM_VGIC_ITS_ADDR_TYPE) + return -ENODEV; + + if (copy_to_user(uaddr, &addr, sizeof(addr))) + return -EFAULT; + break; + } + case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 reg; + int ret; + + ret = vgic_its_attr_regs_access(dev, attr, &reg, false); + if (ret) + return ret; + return put_user(reg, uaddr); + } + default: + return -ENXIO; + } + + return 0; +} + +static struct kvm_device_ops kvm_arm_vgic_its_ops = { + .name = "kvm-arm-vgic-its", + .create = vgic_its_create, + .destroy = vgic_its_destroy, + .set_attr = vgic_its_set_attr, + .get_attr = vgic_its_get_attr, + .has_attr = vgic_its_has_attr, +}; + +int kvm_vgic_register_its_device(void) +{ + return kvm_register_device_ops(&kvm_arm_vgic_its_ops, + KVM_DEV_TYPE_ARM_VGIC_ITS); +} diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c new file mode 100644 index 000000000000..44419679f91a --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -0,0 +1,741 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VGIC: KVM DEVICE API + * + * Copyright (C) 2015 ARM Ltd. 
+ * Author: Marc Zyngier + */ +#include +#include +#include +#include +#include +#include "vgic.h" + +/* common helpers */ + +int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, + phys_addr_t addr, phys_addr_t alignment) +{ + if (addr & ~kvm_phys_mask(kvm)) + return -E2BIG; + + if (!IS_ALIGNED(addr, alignment)) + return -EINVAL; + + if (!IS_VGIC_ADDR_UNDEF(*ioaddr)) + return -EEXIST; + + return 0; +} + +static int vgic_check_type(struct kvm *kvm, int type_needed) +{ + if (kvm->arch.vgic.vgic_model != type_needed) + return -ENODEV; + else + return 0; +} + +/** + * kvm_vgic_addr - set or get vgic VM base addresses + * @kvm: pointer to the vm struct + * @type: the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX + * @addr: pointer to address value + * @write: if true set the address in the VM address space, if false read the + * address + * + * Set or get the vgic base addresses for the distributor and the virtual CPU + * interface in the VM physical address space. These addresses are properties + * of the emulated core/SoC and therefore user space initially knows this + * information. + * Check them for sanity (alignment, double assignment). We can't check for + * overlapping regions in case of a virtual GICv3 here, since we don't know + * the number of VCPUs yet, so we defer this check to map_resources(). 
+ */ +int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) +{ + int r = 0; + struct vgic_dist *vgic = &kvm->arch.vgic; + phys_addr_t *addr_ptr, alignment; + u64 undef_value = VGIC_ADDR_UNDEF; + + mutex_lock(&kvm->lock); + switch (type) { + case KVM_VGIC_V2_ADDR_TYPE_DIST: + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + addr_ptr = &vgic->vgic_dist_base; + alignment = SZ_4K; + break; + case KVM_VGIC_V2_ADDR_TYPE_CPU: + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + addr_ptr = &vgic->vgic_cpu_base; + alignment = SZ_4K; + break; + case KVM_VGIC_V3_ADDR_TYPE_DIST: + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); + addr_ptr = &vgic->vgic_dist_base; + alignment = SZ_64K; + break; + case KVM_VGIC_V3_ADDR_TYPE_REDIST: { + struct vgic_redist_region *rdreg; + + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); + if (r) + break; + if (write) { + r = vgic_v3_set_redist_base(kvm, 0, *addr, 0); + goto out; + } + rdreg = list_first_entry(&vgic->rd_regions, + struct vgic_redist_region, list); + if (!rdreg) + addr_ptr = &undef_value; + else + addr_ptr = &rdreg->base; + break; + } + case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION: + { + struct vgic_redist_region *rdreg; + u8 index; + + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); + if (r) + break; + + index = *addr & KVM_VGIC_V3_RDIST_INDEX_MASK; + + if (write) { + gpa_t base = *addr & KVM_VGIC_V3_RDIST_BASE_MASK; + u32 count = (*addr & KVM_VGIC_V3_RDIST_COUNT_MASK) + >> KVM_VGIC_V3_RDIST_COUNT_SHIFT; + u8 flags = (*addr & KVM_VGIC_V3_RDIST_FLAGS_MASK) + >> KVM_VGIC_V3_RDIST_FLAGS_SHIFT; + + if (!count || flags) + r = -EINVAL; + else + r = vgic_v3_set_redist_base(kvm, index, + base, count); + goto out; + } + + rdreg = vgic_v3_rdist_region_from_index(kvm, index); + if (!rdreg) { + r = -ENOENT; + goto out; + } + + *addr = index; + *addr |= rdreg->base; + *addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT; + goto out; + } + default: + r = -ENODEV; + } + + if (r) + goto out; + + if 
(write) { + r = vgic_check_ioaddr(kvm, addr_ptr, *addr, alignment); + if (!r) + *addr_ptr = *addr; + } else { + *addr = *addr_ptr; + } + +out: + mutex_unlock(&kvm->lock); + return r; +} + +static int vgic_set_common_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int r; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 addr; + unsigned long type = (unsigned long)attr->attr; + + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + r = kvm_vgic_addr(dev->kvm, type, &addr, true); + return (r == -ENODEV) ? -ENXIO : r; + } + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u32 val; + int ret = 0; + + if (get_user(val, uaddr)) + return -EFAULT; + + /* + * We require: + * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs + * - at most 1024 interrupts + * - a multiple of 32 interrupts + */ + if (val < (VGIC_NR_PRIVATE_IRQS + 32) || + val > VGIC_MAX_RESERVED || + (val & 31)) + return -EINVAL; + + mutex_lock(&dev->kvm->lock); + + if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis) + ret = -EBUSY; + else + dev->kvm->arch.vgic.nr_spis = + val - VGIC_NR_PRIVATE_IRQS; + + mutex_unlock(&dev->kvm->lock); + + return ret; + } + case KVM_DEV_ARM_VGIC_GRP_CTRL: { + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + mutex_lock(&dev->kvm->lock); + r = vgic_init(dev->kvm); + mutex_unlock(&dev->kvm->lock); + return r; + } + break; + } + } + + return -ENXIO; +} + +static int vgic_get_common_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int r = -ENXIO; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 addr; + unsigned long type = (unsigned long)attr->attr; + + r = kvm_vgic_addr(dev->kvm, type, &addr, false); + if (r) + return (r == -ENODEV) ? 
-ENXIO : r; + + if (copy_to_user(uaddr, &addr, sizeof(addr))) + return -EFAULT; + break; + } + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + + r = put_user(dev->kvm->arch.vgic.nr_spis + + VGIC_NR_PRIVATE_IRQS, uaddr); + break; + } + } + + return r; +} + +static int vgic_create(struct kvm_device *dev, u32 type) +{ + return kvm_vgic_create(dev->kvm, type); +} + +static void vgic_destroy(struct kvm_device *dev) +{ + kfree(dev); +} + +int kvm_register_vgic_device(unsigned long type) +{ + int ret = -ENODEV; + + switch (type) { + case KVM_DEV_TYPE_ARM_VGIC_V2: + ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops, + KVM_DEV_TYPE_ARM_VGIC_V2); + break; + case KVM_DEV_TYPE_ARM_VGIC_V3: + ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops, + KVM_DEV_TYPE_ARM_VGIC_V3); + + if (ret) + break; + ret = kvm_vgic_register_its_device(); + break; + } + + return ret; +} + +int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, + struct vgic_reg_attr *reg_attr) +{ + int cpuid; + + cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >> + KVM_DEV_ARM_VGIC_CPUID_SHIFT; + + if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) + return -EINVAL; + + reg_attr->vcpu = kvm_get_vcpu(dev->kvm, cpuid); + reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; + + return 0; +} + +/* unlocks vcpus from @vcpu_lock_idx and smaller */ +static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx) +{ + struct kvm_vcpu *tmp_vcpu; + + for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { + tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); + mutex_unlock(&tmp_vcpu->mutex); + } +} + +void unlock_all_vcpus(struct kvm *kvm) +{ + unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1); +} + +/* Returns true if all vcpus were locked, false otherwise */ +bool lock_all_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *tmp_vcpu; + int c; + + /* + * Any time a vcpu is run, vcpu_load is called which tries to grab the + * vcpu->mutex. 
By grabbing the vcpu->mutex of all VCPUs we ensure + * that no other VCPUs are run and fiddle with the vgic state while we + * access it. + */ + kvm_for_each_vcpu(c, tmp_vcpu, kvm) { + if (!mutex_trylock(&tmp_vcpu->mutex)) { + unlock_vcpus(kvm, c - 1); + return false; + } + } + + return true; +} + +/** + * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state + * + * @dev: kvm device handle + * @attr: kvm device attribute + * @reg: address the value is read or written + * @is_write: true if userspace is writing a register + */ +static int vgic_v2_attr_regs_access(struct kvm_device *dev, + struct kvm_device_attr *attr, + u32 *reg, bool is_write) +{ + struct vgic_reg_attr reg_attr; + gpa_t addr; + struct kvm_vcpu *vcpu; + int ret; + + ret = vgic_v2_parse_attr(dev, attr, &reg_attr); + if (ret) + return ret; + + vcpu = reg_attr.vcpu; + addr = reg_attr.addr; + + mutex_lock(&dev->kvm->lock); + + ret = vgic_init(dev->kvm); + if (ret) + goto out; + + if (!lock_all_vcpus(dev->kvm)) { + ret = -EBUSY; + goto out; + } + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg); + break; + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, reg); + break; + default: + ret = -EINVAL; + break; + } + + unlock_all_vcpus(dev->kvm); +out: + mutex_unlock(&dev->kvm->lock); + return ret; +} + +static int vgic_v2_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int ret; + + ret = vgic_set_common_attr(dev, attr); + if (ret != -ENXIO) + return ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u32 reg; + + if (get_user(reg, uaddr)) + return -EFAULT; + + return vgic_v2_attr_regs_access(dev, attr, &reg, true); + } + } + + return -ENXIO; +} + +static int vgic_v2_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int ret; + + ret = 
vgic_get_common_attr(dev, attr); + if (ret != -ENXIO) + return ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u32 reg = 0; + + ret = vgic_v2_attr_regs_access(dev, attr, &reg, false); + if (ret) + return ret; + return put_user(reg, uaddr); + } + } + + return -ENXIO; +} + +static int vgic_v2_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + switch (attr->attr) { + case KVM_VGIC_V2_ADDR_TYPE_DIST: + case KVM_VGIC_V2_ADDR_TYPE_CPU: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + return vgic_v2_has_attr_regs(dev, attr); + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: + return 0; + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return 0; + } + } + return -ENXIO; +} + +struct kvm_device_ops kvm_arm_vgic_v2_ops = { + .name = "kvm-arm-vgic-v2", + .create = vgic_create, + .destroy = vgic_destroy, + .set_attr = vgic_v2_set_attr, + .get_attr = vgic_v2_get_attr, + .has_attr = vgic_v2_has_attr, +}; + +int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, + struct vgic_reg_attr *reg_attr) +{ + unsigned long vgic_mpidr, mpidr_reg; + + /* + * For KVM_DEV_ARM_VGIC_GRP_DIST_REGS group, + * attr might not hold MPIDR. Hence assume vcpu0. 
+ */ + if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) { + vgic_mpidr = (attr->attr & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) >> + KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT; + + mpidr_reg = VGIC_TO_MPIDR(vgic_mpidr); + reg_attr->vcpu = kvm_mpidr_to_vcpu(dev->kvm, mpidr_reg); + } else { + reg_attr->vcpu = kvm_get_vcpu(dev->kvm, 0); + } + + if (!reg_attr->vcpu) + return -EINVAL; + + reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; + + return 0; +} + +/* + * vgic_v3_attr_regs_access - allows user space to access VGIC v3 state + * + * @dev: kvm device handle + * @attr: kvm device attribute + * @reg: address the value is read or written + * @is_write: true if userspace is writing a register + */ +static int vgic_v3_attr_regs_access(struct kvm_device *dev, + struct kvm_device_attr *attr, + u64 *reg, bool is_write) +{ + struct vgic_reg_attr reg_attr; + gpa_t addr; + struct kvm_vcpu *vcpu; + int ret; + u32 tmp32; + + ret = vgic_v3_parse_attr(dev, attr, &reg_attr); + if (ret) + return ret; + + vcpu = reg_attr.vcpu; + addr = reg_attr.addr; + + mutex_lock(&dev->kvm->lock); + + if (unlikely(!vgic_initialized(dev->kvm))) { + ret = -EBUSY; + goto out; + } + + if (!lock_all_vcpus(dev->kvm)) { + ret = -EBUSY; + goto out; + } + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + if (is_write) + tmp32 = *reg; + + ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &tmp32); + if (!is_write) + *reg = tmp32; + break; + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: + if (is_write) + tmp32 = *reg; + + ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &tmp32); + if (!is_write) + *reg = tmp32; + break; + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { + u64 regid; + + regid = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK); + ret = vgic_v3_cpu_sysregs_uaccess(vcpu, is_write, + regid, reg); + break; + } + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { + unsigned int info, intid; + + info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> + KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT; + if (info == 
VGIC_LEVEL_INFO_LINE_LEVEL) { + intid = attr->attr & + KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK; + ret = vgic_v3_line_level_info_uaccess(vcpu, is_write, + intid, reg); + } else { + ret = -EINVAL; + } + break; + } + default: + ret = -EINVAL; + break; + } + + unlock_all_vcpus(dev->kvm); +out: + mutex_unlock(&dev->kvm->lock); + return ret; +} + +static int vgic_v3_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int ret; + + ret = vgic_set_common_attr(dev, attr); + if (ret != -ENXIO) + return ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u32 tmp32; + u64 reg; + + if (get_user(tmp32, uaddr)) + return -EFAULT; + + reg = tmp32; + return vgic_v3_attr_regs_access(dev, attr, &reg, true); + } + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 reg; + + if (get_user(reg, uaddr)) + return -EFAULT; + + return vgic_v3_attr_regs_access(dev, attr, &reg, true); + } + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u64 reg; + u32 tmp32; + + if (get_user(tmp32, uaddr)) + return -EFAULT; + + reg = tmp32; + return vgic_v3_attr_regs_access(dev, attr, &reg, true); + } + case KVM_DEV_ARM_VGIC_GRP_CTRL: { + int ret; + + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: + mutex_lock(&dev->kvm->lock); + + if (!lock_all_vcpus(dev->kvm)) { + mutex_unlock(&dev->kvm->lock); + return -EBUSY; + } + ret = vgic_v3_save_pending_tables(dev->kvm); + unlock_all_vcpus(dev->kvm); + mutex_unlock(&dev->kvm->lock); + return ret; + } + break; + } + } + return -ENXIO; +} + +static int vgic_v3_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int ret; + + ret = vgic_get_common_attr(dev, attr); + if (ret != -ENXIO) + return ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: { + u32 
__user *uaddr = (u32 __user *)(long)attr->addr; + u64 reg; + u32 tmp32; + + ret = vgic_v3_attr_regs_access(dev, attr, &reg, false); + if (ret) + return ret; + tmp32 = reg; + return put_user(tmp32, uaddr); + } + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 reg; + + ret = vgic_v3_attr_regs_access(dev, attr, &reg, false); + if (ret) + return ret; + return put_user(reg, uaddr); + } + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u64 reg; + u32 tmp32; + + ret = vgic_v3_attr_regs_access(dev, attr, &reg, false); + if (ret) + return ret; + tmp32 = reg; + return put_user(tmp32, uaddr); + } + } + return -ENXIO; +} + +static int vgic_v3_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + switch (attr->attr) { + case KVM_VGIC_V3_ADDR_TYPE_DIST: + case KVM_VGIC_V3_ADDR_TYPE_REDIST: + case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + return vgic_v3_has_attr_regs(dev, attr); + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: + return 0; + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { + if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> + KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) == + VGIC_LEVEL_INFO_LINE_LEVEL) + return 0; + break; + } + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return 0; + case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: + return 0; + } + } + return -ENXIO; +} + +struct kvm_device_ops kvm_arm_vgic_v3_ops = { + .name = "kvm-arm-vgic-v3", + .create = vgic_create, + .destroy = vgic_destroy, + .set_attr = vgic_v3_set_attr, + .get_attr = vgic_v3_get_attr, + .has_attr = vgic_v3_has_attr, +}; diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c new file mode 100644 index 000000000000..a016f07adc28 
--- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VGICv2 MMIO handling functions + */ + +#include +#include +#include +#include + +#include +#include + +#include "vgic.h" +#include "vgic-mmio.h" + +/* + * The Revision field in the IIDR has the following meanings: + * + * Revision 1: Report GICv2 interrupts as group 0 instead of group 1 + * Revision 2: Interrupt groups are guest-configurable and signaled using + * their configured groups. + */ + +static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; + u32 value; + + switch (addr & 0x0c) { + case GIC_DIST_CTRL: + value = vgic->enabled ? GICD_ENABLE : 0; + break; + case GIC_DIST_CTR: + value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS; + value = (value >> 5) - 1; + value |= (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; + break; + case GIC_DIST_IIDR: + value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) | + (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) | + (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT); + break; + default: + return 0; + } + + return value; +} + +static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + bool was_enabled = dist->enabled; + + switch (addr & 0x0c) { + case GIC_DIST_CTRL: + dist->enabled = val & GICD_ENABLE; + if (!was_enabled && dist->enabled) + vgic_kick_vcpus(vcpu->kvm); + break; + case GIC_DIST_CTR: + case GIC_DIST_IIDR: + /* Nothing to do */ + return; + } +} + +static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + switch (addr & 0x0c) { + case GIC_DIST_IIDR: + if (val != vgic_mmio_read_v2_misc(vcpu, addr, len)) + return -EINVAL; + + /* + * If we observe a write to GICD_IIDR we know that userspace + * has been updated and has had 
a chance to cope with older + * kernels (VGICv2 IIDR.Revision == 0) incorrectly reporting + * interrupts as group 1, and therefore we now allow groups to + * be user writable. Doing this by default would break + * migration from old kernels to new kernels with legacy + * userspace. + */ + vcpu->kvm->arch.vgic.v2_groups_user_writable = true; + return 0; + } + + vgic_mmio_write_v2_misc(vcpu, addr, len, val); + return 0; +} + +static int vgic_mmio_uaccess_write_v2_group(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + if (vcpu->kvm->arch.vgic.v2_groups_user_writable) + vgic_mmio_write_group(vcpu, addr, len, val); + + return 0; +} + +static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + int nr_vcpus = atomic_read(&source_vcpu->kvm->online_vcpus); + int intid = val & 0xf; + int targets = (val >> 16) & 0xff; + int mode = (val >> 24) & 0x03; + int c; + struct kvm_vcpu *vcpu; + unsigned long flags; + + switch (mode) { + case 0x0: /* as specified by targets */ + break; + case 0x1: + targets = (1U << nr_vcpus) - 1; /* all, ... 
*/
+		targets &= ~(1U << source_vcpu->vcpu_id);	/* but self */
+		break;
+	case 0x2:		/* this very vCPU only */
+		targets = (1U << source_vcpu->vcpu_id);
+		break;
+	case 0x3:		/* reserved */
+		return;
+	}
+
+	kvm_for_each_vcpu(c, vcpu, source_vcpu->kvm) {
+		struct vgic_irq *irq;
+
+		if (!(targets & (1U << c)))
+			continue;
+
+		irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		irq->pending_latch = true;
+		irq->source |= 1U << source_vcpu->vcpu_id;
+
+		vgic_queue_irq_unlock(source_vcpu->kvm, irq, flags);
+		vgic_put_irq(source_vcpu->kvm, irq);
+	}
+}
+/*
+ * Read the per-interrupt target masks: one byte per interrupt, for the
+ * len interrupts starting at the INTID derived from addr.
+ */
+static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu,
+					   gpa_t addr, unsigned int len)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+	int i;
+	u64 val = 0;
+
+	for (i = 0; i < len; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		val |= (u64)irq->targets << (i * 8);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return val;
+}
+/*
+ * Write the per-interrupt target masks, one byte per interrupt. Writes
+ * that start below VGIC_NR_PRIVATE_IRQS are ignored. Each byte is masked
+ * to the online vCPUs; the lowest set bit selects the new target_vcpu,
+ * and an empty mask selects vCPU 0.
+ */
+static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
+				   gpa_t addr, unsigned int len,
+				   unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+	u8 cpu_mask = GENMASK(atomic_read(&vcpu->kvm->online_vcpus) - 1, 0);
+	int i;
+	unsigned long flags;
+
+	/* GICD_ITARGETSR[0-7] are read-only */
+	if (intid < VGIC_NR_PRIVATE_IRQS)
+		return;
+
+	for (i = 0; i < len; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i);
+		int target;
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+		irq->targets = (val >> (i * 8)) & cpu_mask;
+		target = irq->targets ? __ffs(irq->targets) : 0;
+		irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target);
+
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+/*
+ * Read the SGI source bitmaps, one byte per SGI, for the len SGIs starting
+ * at SGI number (addr & 0x0f). Shared by the set and clear register views.
+ */
+static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu,
+					    gpa_t addr, unsigned int len)
+{
+	u32 intid = addr & 0x0f;
+	int i;
+	u64 val = 0;
+
+	for (i = 0; i < len; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		val |= (u64)irq->source << (i * 8);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+	return val;
+}
+/*
+ * Clear the written source bits of each addressed SGI; the pending latch
+ * is dropped once no source bit is left.
+ */
+static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	u32 intid = addr & 0x0f;
+	int i;
+	unsigned long flags;
+
+	for (i = 0; i < len; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+		irq->source &= ~((val >> (i * 8)) & 0xff);
+		if (!irq->source)
+			irq->pending_latch = false;
+
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+/*
+ * Set the written source bits of each addressed SGI. When at least one
+ * source bit is set afterwards, the SGI is latched as pending and passed
+ * to vgic_queue_irq_unlock(), which also releases irq_lock; otherwise the
+ * lock is released directly.
+ */
+static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	u32 intid = addr & 0x0f;
+	int i;
+	unsigned long flags;
+
+	for (i = 0; i < len; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+		irq->source |= (val >> (i * 8)) & 0xff;
+
+		if (irq->source) {
+			irq->pending_latch = true;
+			vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+		} else {
+			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		}
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+#define GICC_ARCH_VERSION_V2	0x2
+
+/* These are for userland accesses only, there is no guest-facing emulation.
*/
+static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu,
+					   gpa_t addr, unsigned int len)
+{
+	struct vgic_vmcr vmcr;
+	u32 val;
+
+	vgic_get_vmcr(vcpu, &vmcr);
+
+	switch (addr & 0xff) {
+	case GIC_CPU_CTRL:
+		val = vmcr.grpen0 << GIC_CPU_CTRL_EnableGrp0_SHIFT;
+		val |= vmcr.grpen1 << GIC_CPU_CTRL_EnableGrp1_SHIFT;
+		val |= vmcr.ackctl << GIC_CPU_CTRL_AckCtl_SHIFT;
+		val |= vmcr.fiqen << GIC_CPU_CTRL_FIQEn_SHIFT;
+		val |= vmcr.cbpr << GIC_CPU_CTRL_CBPR_SHIFT;
+		val |= vmcr.eoim << GIC_CPU_CTRL_EOImodeNS_SHIFT;
+
+		break;
+	case GIC_CPU_PRIMASK:
+		/*
+		 * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
+		 * PMR field as GICH_VMCR.VMPriMask rather than
+		 * GICC_PMR.Priority, so we expose the upper five bits of
+		 * priority mask to userspace using the lower bits in the
+		 * unsigned long.
+		 */
+		val = (vmcr.pmr & GICV_PMR_PRIORITY_MASK) >>
+			GICV_PMR_PRIORITY_SHIFT;
+		break;
+	case GIC_CPU_BINPOINT:
+		val = vmcr.bpr;
+		break;
+	case GIC_CPU_ALIAS_BINPOINT:
+		val = vmcr.abpr;
+		break;
+	case GIC_CPU_IDENT:
+		val = ((PRODUCT_ID_KVM << 20) |
+		       (GICC_ARCH_VERSION_V2 << 16) |
+		       IMPLEMENTER_ARM);
+		break;
+	default:
+		return 0;
+	}
+
+	return val;
+}
+/*
+ * Write a CPU interface register: the current VMCR is fetched, the fields
+ * selected by addr & 0xff are replaced from val, and the result is written
+ * back with vgic_set_vmcr(). Offsets without a case below (GIC_CPU_IDENT
+ * included) leave the fetched VMCR untouched, but it is still written back.
+ */
+static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
+				   gpa_t addr, unsigned int len,
+				   unsigned long val)
+{
+	struct vgic_vmcr vmcr;
+
+	vgic_get_vmcr(vcpu, &vmcr);
+
+	switch (addr & 0xff) {
+	case GIC_CPU_CTRL:
+		vmcr.grpen0 = !!(val & GIC_CPU_CTRL_EnableGrp0);
+		vmcr.grpen1 = !!(val & GIC_CPU_CTRL_EnableGrp1);
+		vmcr.ackctl = !!(val & GIC_CPU_CTRL_AckCtl);
+		vmcr.fiqen = !!(val & GIC_CPU_CTRL_FIQEn);
+		vmcr.cbpr = !!(val & GIC_CPU_CTRL_CBPR);
+		vmcr.eoim = !!(val & GIC_CPU_CTRL_EOImodeNS);
+
+		break;
+	case GIC_CPU_PRIMASK:
+		/*
+		 * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
+		 * PMR field as GICH_VMCR.VMPriMask rather than
+		 * GICC_PMR.Priority, so we expose the upper five bits of
+		 * priority mask to userspace using the lower bits in the
+		 * unsigned long.
+		 */
+		vmcr.pmr = (val << GICV_PMR_PRIORITY_SHIFT) &
+			GICV_PMR_PRIORITY_MASK;
+		break;
+	case GIC_CPU_BINPOINT:
+		vmcr.bpr = val;
+		break;
+	case GIC_CPU_ALIAS_BINPOINT:
+		vmcr.abpr = val;
+		break;
+	}
+
+	vgic_set_vmcr(vcpu, &vmcr);
+}
+/*
+ * Read active priority register n, with n = (addr >> 2) & 0x3. When the
+ * global VGIC type is VGIC_V2 only n == 0 is backed by state and the other
+ * indexes read as zero; otherwise the value comes from the GICv3 vgic_ap1r
+ * array, with indexes above vgic_v3_max_apr_idx() reading as zero.
+ */
+static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu,
+					gpa_t addr, unsigned int len)
+{
+	int n; /* which APRn is this */
+
+	n = (addr >> 2) & 0x3;
+
+	if (kvm_vgic_global_state.type == VGIC_V2) {
+		/* GICv2 hardware systems support max. 32 groups */
+		if (n != 0)
+			return 0;
+		return vcpu->arch.vgic_cpu.vgic_v2.vgic_apr;
+	} else {
+		struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+		if (n > vgic_v3_max_apr_idx(vcpu))
+			return 0;
+
+		n = array_index_nospec(n, 4);
+
+		/* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */
+		return vgicv3->vgic_ap1r[n];
+	}
+}
+/*
+ * Write active priority register n, with n = (addr >> 2) & 0x3. Mirrors
+ * vgic_mmio_read_apr(): writes to indexes that have no backing state are
+ * silently ignored.
+ */
+static void vgic_mmio_write_apr(struct kvm_vcpu *vcpu,
+				gpa_t addr, unsigned int len,
+				unsigned long val)
+{
+	int n; /* which APRn is this */
+
+	n = (addr >> 2) & 0x3;
+
+	if (kvm_vgic_global_state.type == VGIC_V2) {
+		/* GICv2 hardware systems support max. 32 groups */
+		if (n != 0)
+			return;
+		vcpu->arch.vgic_cpu.vgic_v2.vgic_apr = val;
+	} else {
+		struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+		if (n > vgic_v3_max_apr_idx(vcpu))
+			return;
+
+		n = array_index_nospec(n, 4);
+
+		/* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */
+		vgicv3->vgic_ap1r[n] = val;
+	}
+}
+/*
+ * GICv2 distributor register map: for each register (range) the guest
+ * read/write handlers, the optional userspace-access handlers (NULL where
+ * none is given), the length or bits-per-IRQ, and the permitted access
+ * widths.
+ */
+static const struct vgic_register_region vgic_v2_dist_registers[] = {
+	REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_DIST_CTRL,
+		vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc,
+		NULL, vgic_mmio_uaccess_write_v2_misc,
+		12, VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP,
+		vgic_mmio_read_group, vgic_mmio_write_group,
+		NULL, vgic_mmio_uaccess_write_v2_group, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET,
+		vgic_mmio_read_enable, vgic_mmio_write_senable,
+		NULL, vgic_uaccess_write_senable, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR,
+		vgic_mmio_read_enable, vgic_mmio_write_cenable,
+		NULL, vgic_uaccess_write_cenable, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
+		vgic_mmio_read_pending, vgic_mmio_write_spending,
+		NULL, vgic_uaccess_write_spending, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR,
+		vgic_mmio_read_pending, vgic_mmio_write_cpending,
+		NULL, vgic_uaccess_write_cpending, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
+		vgic_mmio_read_active, vgic_mmio_write_sactive,
+		vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR,
+		vgic_mmio_read_active, vgic_mmio_write_cactive,
+		vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI,
+		vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
+		8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET, + vgic_mmio_read_target, vgic_mmio_write_target, NULL, NULL, 8, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG, + vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT, + vgic_mmio_read_raz, vgic_mmio_write_sgir, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_CLEAR, + vgic_mmio_read_sgipend, vgic_mmio_write_sgipendc, 16, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_SET, + vgic_mmio_read_sgipend, vgic_mmio_write_sgipends, 16, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), +}; + +static const struct vgic_register_region vgic_v2_cpu_registers[] = { + REGISTER_DESC_WITH_LENGTH(GIC_CPU_CTRL, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_PRIMASK, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_BINPOINT, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_ALIAS_BINPOINT, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_ACTIVEPRIO, + vgic_mmio_read_apr, vgic_mmio_write_apr, 16, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), +}; + +unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev) +{ + dev->regions = vgic_v2_dist_registers; + dev->nr_regions = ARRAY_SIZE(vgic_v2_dist_registers); + + kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); + + return SZ_4K; +} + +int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + const struct vgic_register_region *region; + struct vgic_io_device iodev; + struct vgic_reg_attr reg_attr; + struct kvm_vcpu *vcpu; + gpa_t addr; + int ret; + + ret = 
vgic_v2_parse_attr(dev, attr, ®_attr); + if (ret) + return ret; + + vcpu = reg_attr.vcpu; + addr = reg_attr.addr; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + iodev.regions = vgic_v2_dist_registers; + iodev.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers); + iodev.base_addr = 0; + break; + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + iodev.regions = vgic_v2_cpu_registers; + iodev.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers); + iodev.base_addr = 0; + break; + default: + return -ENXIO; + } + + /* We only support aligned 32-bit accesses. */ + if (addr & 3) + return -ENXIO; + + region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32)); + if (!region) + return -ENXIO; + + return 0; +} + +int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val) +{ + struct vgic_io_device dev = { + .regions = vgic_v2_cpu_registers, + .nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers), + .iodev_type = IODEV_CPUIF, + }; + + return vgic_uaccess(vcpu, &dev, is_write, offset, val); +} + +int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val) +{ + struct vgic_io_device dev = { + .regions = vgic_v2_dist_registers, + .nr_regions = ARRAY_SIZE(vgic_v2_dist_registers), + .iodev_type = IODEV_DIST, + }; + + return vgic_uaccess(vcpu, &dev, is_write, offset, val); +} diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c new file mode 100644 index 000000000000..89a14ec8b33b --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -0,0 +1,1063 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VGICv3 MMIO handling functions + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "vgic.h" +#include "vgic-mmio.h" + +/* extract @num bytes at @offset bytes offset in data */ +unsigned long extract_bytes(u64 data, unsigned int offset, + unsigned int num) +{ + return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0); +} + +/* 
allows updates of any half of a 64-bit register (or the whole thing) */
+u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+		     unsigned long val)
+{
+	int lower = (offset & 4) * 8;		/* 0 or 32: first bit replaced */
+	int upper = lower + 8 * len - 1;	/* last bit replaced */
+
+	reg &= ~GENMASK_ULL(upper, lower);
+	val &= GENMASK_ULL(len * 8 - 1, 0);
+
+	return reg | ((u64)val << lower);
+}
+/*
+ * True only when the VM uses the GICv3 model and has an ITS.
+ */
+bool vgic_has_its(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+
+	if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3)
+		return false;
+
+	return dist->has_its;
+}
+/*
+ * True when the host has GICv4.1, or when it has GICv4 and the VM has an
+ * ITS.
+ */
+bool vgic_supports_direct_msis(struct kvm *kvm)
+{
+	return (kvm_vgic_global_state.has_gicv4_1 ||
+		(kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm)));
+}
+
+/*
+ * The Revision field in the IIDR has the following meanings:
+ *
+ * Revision 2: Interrupt groups are guest-configurable and signaled using
+ *             their configured groups.
+ */
+/*
+ * Read one of the miscellaneous distributor registers, selected by
+ * addr & 0x0c:
+ *  - GICD_CTLR always reports ARE_NS and DS, plus the group 1 enable bit
+ *    and nASSGIreq according to the distributor state;
+ *  - GICD_TYPER encodes the interrupt count (in units of 32, minus one),
+ *    the number of interrupt ID bits minus one at bit 19 and, with an ITS,
+ *    the LPIS flag;
+ *  - GICD_TYPER2 reports nASSGIcap on a GICv4.1 host, zero otherwise;
+ *  - GICD_IIDR is assembled from the product ID, the implementation
+ *    revision and the implementer ID.
+ */
+static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
+					    gpa_t addr, unsigned int len)
+{
+	struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
+	u32 value = 0;
+
+	switch (addr & 0x0c) {
+	case GICD_CTLR:
+		if (vgic->enabled)
+			value |= GICD_CTLR_ENABLE_SS_G1;
+		value |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
+		if (vgic->nassgireq)
+			value |= GICD_CTLR_nASSGIreq;
+		break;
+	case GICD_TYPER:
+		value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS;
+		value = (value >> 5) - 1;
+		if (vgic_has_its(vcpu->kvm)) {
+			value |= (INTERRUPT_ID_BITS_ITS - 1) << 19;
+			value |= GICD_TYPER_LPIS;
+		} else {
+			value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
+		}
+		break;
+	case GICD_TYPER2:
+		if (kvm_vgic_global_state.has_gicv4_1)
+			value = GICD_TYPER2_nASSGIcap;
+		break;
+	case GICD_IIDR:
+		value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) |
+			(vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) |
+			(IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT);
+		break;
+	default:
+		return 0;
+	}
+
+	return value;
+}
+/*
+ * Guest write to the miscellaneous distributor registers. Only GICD_CTLR
+ * has an effect, and it is handled under kvm->lock: the enable state and
+ * nASSGIreq are updated, nASSGIreq being forced to zero without GICv4.1
+ * and kept at its old value while the distributor stays enabled. A change
+ * of nASSGIreq reconfigures the vSGIs; a change of the enable state
+ * requests a GICv4 reload on all vCPUs on GICv4.1 hosts, otherwise a
+ * disabled-to-enabled transition kicks the vCPUs.
+ */
+static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
+				    gpa_t addr, unsigned int len,
+				    unsigned long val)
+{
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+	switch (addr & 0x0c) {
+	case GICD_CTLR: {
+		bool was_enabled, is_hwsgi;
+
+		mutex_lock(&vcpu->kvm->lock);
+
+		was_enabled = dist->enabled;
+		is_hwsgi = dist->nassgireq;
+
+		dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
+
+		/* Not a GICv4.1? No HW SGIs */
+		if (!kvm_vgic_global_state.has_gicv4_1)
+			val &= ~GICD_CTLR_nASSGIreq;
+
+		/* Dist stays enabled? nASSGIreq is RO */
+		if (was_enabled && dist->enabled) {
+			val &= ~GICD_CTLR_nASSGIreq;
+			val |= FIELD_PREP(GICD_CTLR_nASSGIreq, is_hwsgi);
+		}
+
+		/* Switching HW SGIs? */
+		dist->nassgireq = val & GICD_CTLR_nASSGIreq;
+		if (is_hwsgi != dist->nassgireq)
+			vgic_v4_configure_vsgis(vcpu->kvm);
+
+		if (kvm_vgic_global_state.has_gicv4_1 &&
+		    was_enabled != dist->enabled)
+			kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_RELOAD_GICv4);
+		else if (!was_enabled && dist->enabled)
+			vgic_kick_vcpus(vcpu->kvm);
+
+		mutex_unlock(&vcpu->kvm->lock);
+		break;
+	}
+	case GICD_TYPER:
+	case GICD_TYPER2:
+	case GICD_IIDR:
+		/* This is at best for documentation purposes... */
+		return;
+	}
+}
+/*
+ * Userspace write to the miscellaneous distributor registers. GICD_TYPER2
+ * and GICD_IIDR only accept the value currently read back (-EINVAL
+ * otherwise). GICD_CTLR stores the enable and nASSGIreq state directly,
+ * without the locking, vSGI reconfiguration or vCPU notification done by
+ * the guest write path. Other offsets go through the guest write handler.
+ */
+static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
+					   gpa_t addr, unsigned int len,
+					   unsigned long val)
+{
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+	switch (addr & 0x0c) {
+	case GICD_TYPER2:
+	case GICD_IIDR:
+		if (val != vgic_mmio_read_v3_misc(vcpu, addr, len))
+			return -EINVAL;
+		return 0;
+	case GICD_CTLR:
+		/* Not a GICv4.1?
No HW SGIs */
+		if (!kvm_vgic_global_state.has_gicv4_1)
+			val &= ~GICD_CTLR_nASSGIreq;
+
+		dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
+		dist->nassgireq = val & GICD_CTLR_nASSGIreq;
+		return 0;
+	}
+
+	vgic_mmio_write_v3_misc(vcpu, addr, len, val);
+	return 0;
+}
+/*
+ * Read the routing register of the interrupt addressed by addr (64 bits
+ * per interrupt). Only the lower word carries data (the stored MPIDR);
+ * the upper word and unknown interrupts read as zero.
+ */
+static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu,
+					    gpa_t addr, unsigned int len)
+{
+	int intid = VGIC_ADDR_TO_INTID(addr, 64);
+	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+	unsigned long ret = 0;
+
+	if (!irq)
+		return 0;
+
+	/* The upper word is RAZ for us. */
+	if (!(addr & 4))
+		ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
+
+	vgic_put_irq(vcpu->kvm, irq);
+	return ret;
+}
+/*
+ * Write the routing register of the interrupt addressed by addr. Writes to
+ * the upper word and to unknown interrupts are ignored. The low 24 bits of
+ * val are stored as the interrupt's MPIDR and the target vCPU is looked up
+ * from it, both under irq_lock.
+ */
+static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
+				    gpa_t addr, unsigned int len,
+				    unsigned long val)
+{
+	int intid = VGIC_ADDR_TO_INTID(addr, 64);
+	struct vgic_irq *irq;
+	unsigned long flags;
+
+	/* The upper word is WI for us since we don't implement Aff3. */
+	if (addr & 4)
+		return;
+
+	irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+
+	if (!irq)
+		return;
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+	/* We only care about and preserve Aff0, Aff1 and Aff2. */
+	irq->mpidr = val & GENMASK(23, 0);
+	irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr);
+
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+	vgic_put_irq(vcpu->kvm, irq);
+}
+/*
+ * Read GICR_CTLR: only the LPI enable bit is reported.
+ */
+static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu,
+					     gpa_t addr, unsigned int len)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+	return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0;
+}
+
+/*
+ * Write GICR_CTLR. Ignored when the VM has no ITS. Otherwise the LPI
+ * enable bit is stored; turning LPIs off flushes this vCPU's pending LPIs
+ * and invalidates the ITS cache, turning them on calls vgic_enable_lpis().
+ */
+static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	bool was_enabled = vgic_cpu->lpis_enabled;
+
+	if (!vgic_has_its(vcpu->kvm))
+		return;
+
+	vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
+
+	if (was_enabled && !vgic_cpu->lpis_enabled) {
+		vgic_flush_pending_lpis(vcpu);
+		vgic_its_invalidate_cache(vcpu->kvm);
+	}
+
+	if (!was_enabled && vgic_cpu->lpis_enabled)
+		vgic_enable_lpis(vcpu);
+}
+/*
+ * Read GICR_TYPER: the vCPU's affinity (low 24 bits of the MPIDR value)
+ * goes into the upper word and the vcpu_id into bits [23:8]. The LAST flag
+ * is set when addr is the TYPER address of the last redistributor handed
+ * out from this vCPU's region, and PLPIS when the VM has an ITS.
+ */
+static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
+					      gpa_t addr, unsigned int len)
+{
+	unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_redist_region *rdreg = vgic_cpu->rdreg;
+	int target_vcpu_id = vcpu->vcpu_id;
+	gpa_t last_rdist_typer = rdreg->base + GICR_TYPER +
+			(rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE;
+	u64 value;
+
+	value = (u64)(mpidr & GENMASK(23, 0)) << 32;
+	value |= ((target_vcpu_id & 0xffff) << 8);
+
+	if (addr == last_rdist_typer)
+		value |= GICR_TYPER_LAST;
+	if (vgic_has_its(vcpu->kvm))
+		value |= GICR_TYPER_PLPIS;
+
+	return extract_bytes(value, addr & 7, len);
+}
+/*
+ * Read GICR_IIDR: product ID in bits [31:24], implementer ID in the low
+ * bits.
+ */
+static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu,
+					     gpa_t addr, unsigned int len)
+{
+	return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+}
+/*
+ * Read an identification register: only GICD_PIDR2 has a non-zero value.
+ */
+static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
+					      gpa_t addr, unsigned int len)
+{
+	switch (addr & 0xffff) {
+	case GICD_PIDR2:
+		/* report a GICv3 compliant implementation */
+		return 0x3b;
+	}
+
+	return 0;
+}
+/*
+ * Userspace read of the pending state, one bit per interrupt, for the
+ * len * 8 interrupts starting at the INTID derived from addr.
+ */
+static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu,
+						  gpa_t addr, unsigned int len)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	u32 value = 0;
+	int i;
+
+	/*
+	 * pending state of interrupt is latched in pending_latch variable.
+	 * Userspace will save and restore pending state and line_level
+	 * separately.
+	 * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.txt
+	 * for handling of ISPENDR and ICPENDR.
+	 */
+	for (i = 0; i < len * 8; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+		bool state = irq->pending_latch;
+
+		/* For a HW-backed SGI the state is read from the host irqchip. */
+		if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+			int err;
+
+			err = irq_get_irqchip_state(irq->host_irq,
+						    IRQCHIP_STATE_PENDING,
+						    &state);
+			WARN_ON(err);
+		}
+
+		if (state)
+			value |= (1U << i);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return value;
+}
+/*
+ * Userspace write of the pending state, one bit per interrupt: a set bit
+ * latches the interrupt as pending and queues it, a clear bit drops the
+ * latch. Always returns 0.
+ */
+static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
+					 gpa_t addr, unsigned int len,
+					 unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	int i;
+	unsigned long flags;
+
+	for (i = 0; i < len * 8; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		if (test_bit(i, &val)) {
+			/*
+			 * pending_latch is set irrespective of irq type
+			 * (level or edge) to avoid dependency that VM should
+			 * restore irq config before pending info.
+			 */
+			irq->pending_latch = true;
+			vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+		} else {
+			irq->pending_latch = false;
+			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		}
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return 0;
+}
+
+/* We want to avoid outer shareable. */
+u64 vgic_sanitise_shareability(u64 field)
+{
+	switch (field) {
+	case GIC_BASER_OuterShareable:
+		return GIC_BASER_InnerShareable;
+	default:
+		return field;
+	}
+}
+
+/* Avoid any inner non-cacheable mapping. */
+u64 vgic_sanitise_inner_cacheability(u64 field)
+{
+	switch (field) {
+	case GIC_BASER_CACHE_nCnB:
+	case GIC_BASER_CACHE_nC:
+		return GIC_BASER_CACHE_RaWb;
+	default:
+		return field;
+	}
+}
+
+/* Non-cacheable or same-as-inner are OK.
*/
+u64 vgic_sanitise_outer_cacheability(u64 field)
+{
+	switch (field) {
+	case GIC_BASER_CACHE_SameAsInner:
+	case GIC_BASER_CACHE_nC:
+		return field;
+	default:
+		return GIC_BASER_CACHE_nC;
+	}
+}
+/*
+ * Extract the field described by @field_mask/@field_shift from @reg, pass
+ * it through @sanitise_fn and return @reg with the field replaced by the
+ * sanitised value. All other bits of @reg are preserved.
+ */
+u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
+			u64 (*sanitise_fn)(u64))
+{
+	u64 field = (reg & field_mask) >> field_shift;
+
+	field = sanitise_fn(field) << field_shift;
+	return (reg & ~field_mask) | field;
+}
+
+#define PROPBASER_RES0_MASK						\
+	(GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5))
+#define PENDBASER_RES0_MASK						\
+	(BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) |	\
+	 GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0))
+/*
+ * Sanitise a GICR_PENDBASER value: the shareability and inner/outer
+ * cacheability fields are run through their sanitisers and the bits in
+ * PENDBASER_RES0_MASK are cleared.
+ */
+static u64 vgic_sanitise_pendbaser(u64 reg)
+{
+	reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK,
+				  GICR_PENDBASER_SHAREABILITY_SHIFT,
+				  vgic_sanitise_shareability);
+	reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK,
+				  GICR_PENDBASER_INNER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_inner_cacheability);
+	reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK,
+				  GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_outer_cacheability);
+
+	reg &= ~PENDBASER_RES0_MASK;
+
+	return reg;
+}
+/*
+ * Sanitise a GICR_PROPBASER value: same treatment as the PENDBASER
+ * variant, using the PROPBASER field definitions and PROPBASER_RES0_MASK.
+ */
+static u64 vgic_sanitise_propbaser(u64 reg)
+{
+	reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK,
+				  GICR_PROPBASER_SHAREABILITY_SHIFT,
+				  vgic_sanitise_shareability);
+	reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK,
+				  GICR_PROPBASER_INNER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_inner_cacheability);
+	reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK,
+				  GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_outer_cacheability);
+
+	reg &= ~PROPBASER_RES0_MASK;
+	return reg;
+}
+/*
+ * Read (part of) GICR_PROPBASER. The value is stored once per VM, in the
+ * distributor structure, not per vCPU.
+ */
+static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu,
+					     gpa_t addr, unsigned int len)
+{
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+	return extract_bytes(dist->propbaser, addr & 7, len);
+}
+/*
+ * Write (part of) GICR_PROPBASER. Ignored while this vCPU has LPIs
+ * enabled. The addressed half (or the whole register) is merged into the
+ * current value, sanitised, and installed with a cmpxchg64() loop that
+ * retries when the stored value changed in the meantime.
+ */
+static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	u64 old_propbaser, propbaser;
+
+	/* Storing a value with LPIs already enabled is undefined */
+	if (vgic_cpu->lpis_enabled)
+		return;
+
+	do {
+		old_propbaser = READ_ONCE(dist->propbaser);
+		propbaser = old_propbaser;
+		propbaser = update_64bit_reg(propbaser, addr & 4, len, val);
+		propbaser = vgic_sanitise_propbaser(propbaser);
+	} while (cmpxchg64(&dist->propbaser, old_propbaser,
+			   propbaser) != old_propbaser);
+}
+/*
+ * Read (part of) this vCPU's GICR_PENDBASER. The PTZ bit always reads as
+ * zero.
+ */
+static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu,
+					     gpa_t addr, unsigned int len)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	u64 value = vgic_cpu->pendbaser;
+
+	value &= ~GICR_PENDBASER_PTZ;
+
+	return extract_bytes(value, addr & 7, len);
+}
+/*
+ * Write (part of) this vCPU's GICR_PENDBASER. Ignored while LPIs are
+ * enabled; otherwise merged, sanitised and installed with the same
+ * cmpxchg64() retry loop as the PROPBASER write.
+ */
+static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	u64 old_pendbaser, pendbaser;
+
+	/* Storing a value with LPIs already enabled is undefined */
+	if (vgic_cpu->lpis_enabled)
+		return;
+
+	do {
+		old_pendbaser = READ_ONCE(vgic_cpu->pendbaser);
+		pendbaser = old_pendbaser;
+		pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val);
+		pendbaser = vgic_sanitise_pendbaser(pendbaser);
+	} while (cmpxchg64(&vgic_cpu->pendbaser, old_pendbaser,
+			   pendbaser) != old_pendbaser);
+}
+
+/*
+ * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the
+ * redistributors, while SPIs are covered by registers in the distributor
+ * block. Trying to set private IRQs in this block gets ignored.
+ * We take some special care here to fix the calculation of the register
+ * offset.
+ */
+#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, ur, uw, bpi, acc) \
+	{								\
+		.reg_offset = off,					\
+		.bits_per_irq = bpi,					\
+		.len = (bpi * VGIC_NR_PRIVATE_IRQS) / 8,		\
+		.access_flags = acc,					\
+		.read = vgic_mmio_read_raz,				\
+		.write = vgic_mmio_write_wi,				\
+	}, {								\
+		.reg_offset = off + (bpi * VGIC_NR_PRIVATE_IRQS) / 8,	\
+		.bits_per_irq = bpi,					\
+		.len = (bpi * (1024 - VGIC_NR_PRIVATE_IRQS)) / 8,	\
+		.access_flags = acc,					\
+		.read = rd,						\
+		.write = wr,						\
+		.uaccess_read = ur,					\
+		.uaccess_write = uw,					\
+	}
+/*
+ * GICv3 distributor register map. Every ..._SHARED entry expands to two
+ * regions: a read-as-zero/write-ignored one covering the private
+ * interrupts, followed by the real handlers for the remaining interrupt
+ * IDs up to 1024.
+ */
+static const struct vgic_register_region vgic_v3_dist_registers[] = {
+	REGISTER_DESC_WITH_LENGTH_UACCESS(GICD_CTLR,
+		vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc,
+		NULL, vgic_mmio_uaccess_write_v3_misc,
+		16, VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GICD_STATUSR,
+		vgic_mmio_read_rao, vgic_mmio_write_wi, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR,
+		vgic_mmio_read_group, vgic_mmio_write_group, NULL, NULL, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER,
+		vgic_mmio_read_enable, vgic_mmio_write_senable,
+		NULL, vgic_uaccess_write_senable, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER,
+		vgic_mmio_read_enable, vgic_mmio_write_cenable,
+		NULL, vgic_uaccess_write_cenable, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR,
+		vgic_mmio_read_pending, vgic_mmio_write_spending,
+		vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR,
+		vgic_mmio_read_pending, vgic_mmio_write_cpending,
+		vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER,
+		vgic_mmio_read_active, vgic_mmio_write_sactive,
+		vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER,
+		vgic_mmio_read_active, vgic_mmio_write_cactive,
+		vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive,
+		1, VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR,
+		vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
+		8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR,
+		vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 8,
+		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR,
+		vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR,
+		vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER,
+		vgic_mmio_read_irouter, vgic_mmio_write_irouter, NULL, NULL, 64,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GICD_IDREGS,
+		vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
+		VGIC_ACCESS_32bit),
+};
+/*
+ * GICv3 redistributor register map: the RD_base registers first, then the
+ * SGI_base registers at an offset of SZ_64K.
+ */
+static const struct vgic_register_region vgic_v3_rd_registers[] = {
+	/* RD_base registers */
+	REGISTER_DESC_WITH_LENGTH(GICR_CTLR,
+		vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GICR_STATUSR,
+		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
+		vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GICR_TYPER,
+		vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GICR_WAKER,
+		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER,
+		vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER,
+		vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+
REGISTER_DESC_WITH_LENGTH(GICR_IDREGS,
+		vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
+		VGIC_ACCESS_32bit),
+	/* SGI_base registers */
+	REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGROUPR0,
+		vgic_mmio_read_group, vgic_mmio_write_group, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISENABLER0,
+		vgic_mmio_read_enable, vgic_mmio_write_senable,
+		NULL, vgic_uaccess_write_senable, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICENABLER0,
+		vgic_mmio_read_enable, vgic_mmio_write_cenable,
+		NULL, vgic_uaccess_write_cenable, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0,
+		vgic_mmio_read_pending, vgic_mmio_write_spending,
+		vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0,
+		vgic_mmio_read_pending, vgic_mmio_write_cpending,
+		vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISACTIVER0,
+		vgic_mmio_read_active, vgic_mmio_write_sactive,
+		vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICACTIVER0,
+		vgic_mmio_read_active, vgic_mmio_write_cactive,
+		vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IPRIORITYR0,
+		vgic_mmio_read_priority, vgic_mmio_write_priority, 32,
+		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+	REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ICFGR0,
+		vgic_mmio_read_config, vgic_mmio_write_config, 8,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGRPMODR0,
+		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_NSACR,
+		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+		VGIC_ACCESS_32bit),
+};
+/*
+ * Point @dev at the GICv3 distributor register map and initialise its KVM
+ * I/O device with the GIC ops. Returns SZ_64K.
+ */
+unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev)
+{
+	dev->regions = vgic_v3_dist_registers;
+	dev->nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
+
+	kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
+
+	return SZ_64K;
+}
+
+/**
+ * vgic_register_redist_iodev - register a single redist iodev
+ * @vcpu:    The VCPU to which the redistributor belongs
+ *
+ * Register a KVM iodev for this VCPU's redistributor using the address
+ * provided.
+ *
+ * Nothing is done (and 0 is returned) when the iodev already has a base
+ * address or when no redistributor region has a free slot. The region's
+ * free_index is only advanced after a successful bus registration.
+ *
+ * Return 0 on success, -ERRNO otherwise.
+ */
+int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct vgic_dist *vgic = &kvm->arch.vgic;
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
+	struct vgic_redist_region *rdreg;
+	gpa_t rd_base;
+	int ret;
+
+	if (!IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr))
+		return 0;
+
+	/*
+	 * We may be creating VCPUs before having set the base address for the
+	 * redistributor region, in which case we will come back to this
+	 * function for all VCPUs when the base address is set.  Just return
+	 * without doing any work for now.
+	 */
+	rdreg = vgic_v3_rdist_free_slot(&vgic->rd_regions);
+	if (!rdreg)
+		return 0;
+
+	if (!vgic_v3_check_base(kvm))
+		return -EINVAL;
+
+	vgic_cpu->rdreg = rdreg;
+
+	rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE;
+
+	kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops);
+	rd_dev->base_addr = rd_base;
+	rd_dev->iodev_type = IODEV_REDIST;
+	rd_dev->regions = vgic_v3_rd_registers;
+	rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rd_registers);
+	rd_dev->redist_vcpu = vcpu;
+
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base,
+				      2 * SZ_64K, &rd_dev->dev);
+	mutex_unlock(&kvm->slots_lock);
+
+	if (ret)
+		return ret;
+
+	rdreg->free_index++;
+	return 0;
+}
+/*
+ * Remove this vCPU's redistributor iodev from the MMIO bus. The only
+ * caller visible here holds kvm->slots_lock around the call.
+ */
+static void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu)
+{
+	struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
+
+	kvm_io_bus_unregister_dev(vcpu->kvm, KVM_MMIO_BUS, &rd_dev->dev);
+}
+/*
+ * Register the redistributor iodev of every vCPU. On the first failure
+ * the vCPUs before the failing one are unregistered again, in reverse
+ * order and under kvm->slots_lock, and the error is returned.
+ */
+static int vgic_register_all_redist_iodevs(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu;
+	int c, ret = 0;
+
+	kvm_for_each_vcpu(c, vcpu, kvm) {
+		ret = vgic_register_redist_iodev(vcpu);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		/* The current c failed, so we start with the previous one. */
+		mutex_lock(&kvm->slots_lock);
+		for (c--; c >= 0; c--) {
+			vcpu = kvm_get_vcpu(kvm, c);
+			vgic_unregister_redist_iodev(vcpu);
+		}
+		mutex_unlock(&kvm->slots_lock);
+	}
+
+	return ret;
+}
+
+/**
+ * vgic_v3_insert_redist_region - Insert a new redistributor region
+ *
+ * Performs various checks before inserting the rdist region in the list.
+ * Those tests depend on whether the size of the rdist region is known
+ * (ie. count != 0). The list is sorted by rdist region index.
+ * + * @kvm: kvm handle + * @index: redist region index + * @base: base of the new rdist region + * @count: number of redistributors the region is made of (0 in the old style + * single region, whose size is induced from the number of vcpus) + * + * Return 0 on success, < 0 otherwise + */ +static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index, + gpa_t base, uint32_t count) +{ + struct vgic_dist *d = &kvm->arch.vgic; + struct vgic_redist_region *rdreg; + struct list_head *rd_regions = &d->rd_regions; + size_t size = count * KVM_VGIC_V3_REDIST_SIZE; + int ret; + + /* single rdist region already set ?*/ + if (!count && !list_empty(rd_regions)) + return -EINVAL; + + /* cross the end of memory ? */ + if (base + size < base) + return -EINVAL; + + if (list_empty(rd_regions)) { + if (index != 0) + return -EINVAL; + } else { + rdreg = list_last_entry(rd_regions, + struct vgic_redist_region, list); + if (index != rdreg->index + 1) + return -EINVAL; + + /* Cannot add an explicitly sized regions after legacy region */ + if (!rdreg->count) + return -EINVAL; + } + + /* + * For legacy single-region redistributor regions (!count), + * check that the redistributor region does not overlap with the + * distributor's address space. + */ + if (!count && !IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) && + vgic_dist_overlap(kvm, base, size)) + return -EINVAL; + + /* collision with any other rdist region? 
*/ + if (vgic_v3_rdist_overlap(kvm, base, size)) + return -EINVAL; + + rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL); + if (!rdreg) + return -ENOMEM; + + rdreg->base = VGIC_ADDR_UNDEF; + + ret = vgic_check_ioaddr(kvm, &rdreg->base, base, SZ_64K); + if (ret) + goto free; + + rdreg->base = base; + rdreg->count = count; + rdreg->free_index = 0; + rdreg->index = index; + + list_add_tail(&rdreg->list, rd_regions); + return 0; +free: + kfree(rdreg); + return ret; +} + +int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count) +{ + int ret; + + ret = vgic_v3_insert_redist_region(kvm, index, addr, count); + if (ret) + return ret; + + /* + * Register iodevs for each existing VCPU. Adding more VCPUs + * afterwards will register the iodevs when needed. + */ + ret = vgic_register_all_redist_iodevs(kvm); + if (ret) + return ret; + + return 0; +} + +int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + const struct vgic_register_region *region; + struct vgic_io_device iodev; + struct vgic_reg_attr reg_attr; + struct kvm_vcpu *vcpu; + gpa_t addr; + int ret; + + ret = vgic_v3_parse_attr(dev, attr, ®_attr); + if (ret) + return ret; + + vcpu = reg_attr.vcpu; + addr = reg_attr.addr; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + iodev.regions = vgic_v3_dist_registers; + iodev.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers); + iodev.base_addr = 0; + break; + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:{ + iodev.regions = vgic_v3_rd_registers; + iodev.nr_regions = ARRAY_SIZE(vgic_v3_rd_registers); + iodev.base_addr = 0; + break; + } + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { + u64 reg, id; + + id = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK); + return vgic_v3_has_cpu_sysregs_attr(vcpu, 0, id, ®); + } + default: + return -ENXIO; + } + + /* We only support aligned 32-bit accesses. 
*/ + if (addr & 3) + return -ENXIO; + + region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32)); + if (!region) + return -ENXIO; + + return 0; +} +/* + * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI + * generation register ICC_SGI1R_EL1) with a given VCPU. + * If the VCPU's MPIDR matches, return the level0 affinity, otherwise + * return -1. + */ +static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu) +{ + unsigned long affinity; + int level0; + + /* + * Split the current VCPU's MPIDR into affinity level 0 and the + * rest as this is what we have to compare against. + */ + affinity = kvm_vcpu_get_mpidr_aff(vcpu); + level0 = MPIDR_AFFINITY_LEVEL(affinity, 0); + affinity &= ~MPIDR_LEVEL_MASK; + + /* bail out if the upper three levels don't match */ + if (sgi_aff != affinity) + return -1; + + /* Is this VCPU's bit set in the mask ? */ + if (!(sgi_cpu_mask & BIT(level0))) + return -1; + + return level0; +} + +/* + * The ICC_SGI* registers encode the affinity differently from the MPIDR, + * so provide a wrapper to use the existing defines to isolate a certain + * affinity level. + */ +#define SGI_AFFINITY_LEVEL(reg, level) \ + ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \ + >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) + +/** + * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs + * @vcpu: The VCPU requesting a SGI + * @reg: The value written into ICC_{ASGI1,SGI0,SGI1}R by that VCPU + * @allow_group1: Does the sysreg access allow generation of G1 SGIs + * + * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register. + * This will trap in sys_regs.c and call this function. + * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the + * target processors as well as a bitmask of 16 Aff0 CPUs. + * If the interrupt routing mode bit is not set, we iterate over all VCPUs to + * check for matching ones. 
If this bit is set, we signal all, but not the
 * calling VCPU.
 */
void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvm_vcpu *c_vcpu;
	u16 target_cpus;
	u64 mpidr;
	int sgi, c;
	bool broadcast;
	unsigned long flags;

	/* Decode the SGI number, routing mode, target list and affinity */
	sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
	broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
	target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
	mpidr = SGI_AFFINITY_LEVEL(reg, 3);
	mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
	mpidr |= SGI_AFFINITY_LEVEL(reg, 1);

	/*
	 * We iterate over all VCPUs to find the MPIDRs matching the request.
	 * If we have handled one CPU, we clear its bit to detect early
	 * if we are already finished. This avoids iterating through all
	 * VCPUs when most of the times we just signal a single VCPU.
	 */
	kvm_for_each_vcpu(c, c_vcpu, kvm) {
		struct vgic_irq *irq;

		/* Exit early if we have dealt with all requested CPUs */
		if (!broadcast && target_cpus == 0)
			break;

		/*
		 * Don't signal the calling VCPU.  Compare the VCPU pointers
		 * rather than the iteration index with vcpu_id: the index
		 * reflects the position in the VM's VCPU array, which does
		 * not have to be the same number as the vcpu_id.
		 */
		if (broadcast && c_vcpu == vcpu)
			continue;

		if (!broadcast) {
			int level0;

			level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
			if (level0 == -1)
				continue;

			/* remove this matching VCPU from the mask */
			target_cpus &= ~BIT(level0);
		}

		irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi);

		raw_spin_lock_irqsave(&irq->irq_lock, flags);

		/*
		 * An access targetting Group0 SGIs can only generate
		 * those, while an access targetting Group1 SGIs can
		 * generate interrupts of either group.
		 */
		if (!irq->group || allow_group1) {
			if (!irq->hw) {
				irq->pending_latch = true;
				vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
			} else {
				/* HW SGI? 
Ask the GIC to inject it */ + int err; + err = irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + true); + WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + } + } else { + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + } + + vgic_put_irq(vcpu->kvm, irq); + } +} + +int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val) +{ + struct vgic_io_device dev = { + .regions = vgic_v3_dist_registers, + .nr_regions = ARRAY_SIZE(vgic_v3_dist_registers), + }; + + return vgic_uaccess(vcpu, &dev, is_write, offset, val); +} + +int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val) +{ + struct vgic_io_device rd_dev = { + .regions = vgic_v3_rd_registers, + .nr_regions = ARRAY_SIZE(vgic_v3_rd_registers), + }; + + return vgic_uaccess(vcpu, &rd_dev, is_write, offset, val); +} + +int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, + u32 intid, u64 *val) +{ + if (intid % 32) + return -EINVAL; + + if (is_write) + vgic_write_irq_line_level_info(vcpu, intid, *val); + else + *val = vgic_read_irq_line_level_info(vcpu, intid); + + return 0; +} diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c new file mode 100644 index 000000000000..b2d73fc0d1ef --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -0,0 +1,1088 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VGIC MMIO handling functions + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vgic.h" +#include "vgic-mmio.h" + +unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return 0; +} + +unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return -1UL; +} + +void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val) +{ + /* Ignore */ +} + +int vgic_mmio_uaccess_write_wi(struct 
kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val) +{ + /* Ignore */ + return 0; +} + +unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* Loop over all IRQs affected by this read */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + if (irq->group) + value |= BIT(i); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +static void vgic_update_vsgi(struct vgic_irq *irq) +{ + WARN_ON(its_prop_update_vsgi(irq->host_irq, irq->priority, irq->group)); +} + +void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->group = !!(val & BIT(i)); + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + vgic_update_vsgi(irq); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + } else { + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + } + + vgic_put_irq(vcpu->kvm, irq); + } +} + +/* + * Read accesses to both GICD_ICENABLER and GICD_ISENABLER return the value + * of the enabled bit, so there is only one function for both here. 
+ */ +unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* Loop over all IRQs affected by this read */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + if (irq->enabled) + value |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + if (!irq->enabled) { + struct irq_data *data; + + irq->enabled = true; + data = &irq_to_desc(irq->host_irq)->irq_data; + while (irqd_irq_disabled(data)) + enable_irq(irq->host_irq); + } + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + + continue; + } else if (vgic_irq_is_mapped_level(irq)) { + bool was_high = irq->line_level; + + /* + * We need to update the state of the interrupt because + * the guest might have changed the state of the device + * while the interrupt was disabled at the VGIC level. + */ + irq->line_level = vgic_get_phys_line_level(irq); + /* + * Deactivate the physical interrupt so the GIC will let + * us know when it is asserted again. 
+ */ + if (!irq->active && was_high && !irq->line_level) + vgic_irq_set_phys_active(irq, false); + } + irq->enabled = true; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + + vgic_put_irq(vcpu->kvm, irq); + } +} + +void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->hw && vgic_irq_is_sgi(irq->intid) && irq->enabled) + disable_irq_nosync(irq->host_irq); + + irq->enabled = false; + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->enabled = true; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + + vgic_put_irq(vcpu->kvm, irq); + } + + return 0; +} + +int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->enabled = false; + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + } + + return 0; +} + +unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* Loop over all IRQs affected by this read */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq 
*irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + unsigned long flags; + bool val; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + int err; + + val = false; + err = irq_get_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + &val); + WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); + } else { + val = irq_is_pending(irq); + } + + value |= ((u32)val << i); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq) +{ + return (vgic_irq_is_sgi(irq->intid) && + vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2); +} + +void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + /* GICD_ISPENDR0 SGI bits are WI */ + if (is_vgic_v2_sgi(vcpu, irq)) { + vgic_put_irq(vcpu->kvm, irq); + continue; + } + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + /* HW SGI? 
Ask the GIC to inject it */ + int err; + err = irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + true); + WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + + continue; + } + + irq->pending_latch = true; + if (irq->hw) + vgic_irq_set_phys_active(irq, true); + + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->pending_latch = true; + + /* + * GICv2 SGIs are terribly broken. We can't restore + * the source of the interrupt, so just pick the vcpu + * itself as the source... + */ + if (is_vgic_v2_sgi(vcpu, irq)) + irq->source |= BIT(vcpu->vcpu_id); + + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + + vgic_put_irq(vcpu->kvm, irq); + } + + return 0; +} + +/* Must be called with irq->irq_lock held */ +static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq) +{ + irq->pending_latch = false; + + /* + * We don't want the guest to effectively mask the physical + * interrupt by doing a write to SPENDR followed by a write to + * CPENDR for HW interrupts, so we clear the active state on + * the physical side if the virtual interrupt is not active. + * This may lead to taking an additional interrupt on the + * host, but that should not be a problem as the worst that + * can happen is an additional vgic injection. We also clear + * the pending state to maintain proper semantics for edge HW + * interrupts. 
+ */ + vgic_irq_set_phys_pending(irq, false); + if (!irq->active) + vgic_irq_set_phys_active(irq, false); +} + +void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + /* GICD_ICPENDR0 SGI bits are WI */ + if (is_vgic_v2_sgi(vcpu, irq)) { + vgic_put_irq(vcpu->kvm, irq); + continue; + } + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + /* HW SGI? Ask the GIC to clear its pending bit */ + int err; + err = irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + false); + WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + + continue; + } + + if (irq->hw) + vgic_hw_irq_cpending(vcpu, irq); + else + irq->pending_latch = false; + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + /* + * More fun with GICv2 SGIs! If we're clearing one of them + * from userspace, which source vcpu to clear? Let's not + * even think of it, and blow the whole set. 
+ */ + if (is_vgic_v2_sgi(vcpu, irq)) + irq->source = 0; + + irq->pending_latch = false; + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + } + + return 0; +} + +/* + * If we are fiddling with an IRQ's active state, we have to make sure the IRQ + * is not queued on some running VCPU's LRs, because then the change to the + * active state can be overwritten when the VCPU's state is synced coming back + * from the guest. + * + * For shared interrupts as well as GICv3 private interrupts, we have to + * stop all the VCPUs because interrupts can be migrated while we don't hold + * the IRQ locks and we don't want to be chasing moving targets. + * + * For GICv2 private interrupts we don't have to do anything because + * userspace accesses to the VGIC state already require all VCPUs to be + * stopped, and only the VCPU itself can modify its private interrupts + * active state, which guarantees that the VCPU is not running. + */ +static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid) +{ + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + intid >= VGIC_NR_PRIVATE_IRQS) + kvm_arm_halt_guest(vcpu->kvm); +} + +/* See vgic_access_active_prepare */ +static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid) +{ + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + intid >= VGIC_NR_PRIVATE_IRQS) + kvm_arm_resume_guest(vcpu->kvm); +} + +static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* Loop over all IRQs affected by this read */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + /* + * Even for HW interrupts, don't evaluate the HW state as + * all the guest is interested in is the virtual state. 
+ */ + if (irq->active) + value |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 val; + + mutex_lock(&vcpu->kvm->lock); + vgic_access_active_prepare(vcpu, intid); + + val = __vgic_mmio_read_active(vcpu, addr, len); + + vgic_access_active_finish(vcpu, intid); + mutex_unlock(&vcpu->kvm->lock); + + return val; +} + +unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return __vgic_mmio_read_active(vcpu, addr, len); +} + +/* Must be called with irq->irq_lock held */ +static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, + bool active, bool is_uaccess) +{ + if (is_uaccess) + return; + + irq->active = active; + vgic_irq_set_phys_active(irq, active); +} + +static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, + bool active) +{ + unsigned long flags; + struct kvm_vcpu *requester_vcpu = kvm_get_running_vcpu(); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (irq->hw && !vgic_irq_is_sgi(irq->intid)) { + vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); + } else if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + /* + * GICv4.1 VSGI feature doesn't track an active state, + * so let's not kid ourselves, there is nothing we can + * do here. + */ + irq->active = false; + } else { + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u8 active_source; + + irq->active = active; + + /* + * The GICv2 architecture indicates that the source CPUID for + * an SGI should be provided during an EOI which implies that + * the active state is stored somewhere, but at the same time + * this state is not architecturally exposed anywhere and we + * have no way of knowing the right source. 
+ * + * This may lead to a VCPU not being able to receive + * additional instances of a particular SGI after migration + * for a GICv2 VM on some GIC implementations. Oh well. + */ + active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0; + + if (model == KVM_DEV_TYPE_ARM_VGIC_V2 && + active && vgic_irq_is_sgi(irq->intid)) + irq->active_source = active_source; + } + + if (irq->active) + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + else + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); +} + +static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + vgic_mmio_change_active(vcpu, irq, false); + vgic_put_irq(vcpu->kvm, irq); + } +} + +void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + + mutex_lock(&vcpu->kvm->lock); + vgic_access_active_prepare(vcpu, intid); + + __vgic_mmio_write_cactive(vcpu, addr, len, val); + + vgic_access_active_finish(vcpu, intid); + mutex_unlock(&vcpu->kvm->lock); +} + +int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + __vgic_mmio_write_cactive(vcpu, addr, len, val); + return 0; +} + +static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + vgic_mmio_change_active(vcpu, irq, true); + vgic_put_irq(vcpu->kvm, irq); + } +} + +void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + + mutex_lock(&vcpu->kvm->lock); + 
vgic_access_active_prepare(vcpu, intid); + + __vgic_mmio_write_sactive(vcpu, addr, len, val); + + vgic_access_active_finish(vcpu, intid); + mutex_unlock(&vcpu->kvm->lock); +} + +int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + __vgic_mmio_write_sactive(vcpu, addr, len, val); + return 0; +} + +unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 8); + int i; + u64 val = 0; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + val |= (u64)irq->priority << (i * 8); + + vgic_put_irq(vcpu->kvm, irq); + } + + return val; +} + +/* + * We currently don't handle changing the priority of an interrupt that + * is already pending on a VCPU. If there is a need for this, we would + * need to make this VCPU exit and re-evaluate the priorities, potentially + * leading to this interrupt getting presented now to the guest (if it has + * been masked by the priority mask before). 
+ */ +void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 8); + int i; + unsigned long flags; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + /* Narrow the priority range to what we actually support */ + irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS); + if (irq->hw && vgic_irq_is_sgi(irq->intid)) + vgic_update_vsgi(irq); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + } +} + +unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 2); + u32 value = 0; + int i; + + for (i = 0; i < len * 4; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + if (irq->config == VGIC_CONFIG_EDGE) + value |= (2U << (i * 2)); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +void vgic_mmio_write_config(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 2); + int i; + unsigned long flags; + + for (i = 0; i < len * 4; i++) { + struct vgic_irq *irq; + + /* + * The configuration cannot be changed for SGIs in general, + * for PPIs this is IMPLEMENTATION DEFINED. The arch timer + * code relies on PPIs being level triggered, so we also + * make them read-only here. 
+ */ + if (intid + i < VGIC_NR_PRIVATE_IRQS) + continue; + + irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (test_bit(i * 2 + 1, &val)) + irq->config = VGIC_CONFIG_EDGE; + else + irq->config = VGIC_CONFIG_LEVEL; + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) +{ + int i; + u64 val = 0; + int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; + + for (i = 0; i < 32; i++) { + struct vgic_irq *irq; + + if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) + continue; + + irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level) + val |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); + } + + return val; +} + +void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, + const u64 val) +{ + int i; + int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; + unsigned long flags; + + for (i = 0; i < 32; i++) { + struct vgic_irq *irq; + bool new_level; + + if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) + continue; + + irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + /* + * Line level is set irrespective of irq type + * (level or edge) to avoid dependency that VM should + * restore irq config before line level. 
+ */ + new_level = !!(val & (1U << i)); + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->line_level = new_level; + if (new_level) + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + else + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + } +} + +static int match_region(const void *key, const void *elt) +{ + const unsigned int offset = (unsigned long)key; + const struct vgic_register_region *region = elt; + + if (offset < region->reg_offset) + return -1; + + if (offset >= region->reg_offset + region->len) + return 1; + + return 0; +} + +const struct vgic_register_region * +vgic_find_mmio_region(const struct vgic_register_region *regions, + int nr_regions, unsigned int offset) +{ + return bsearch((void *)(uintptr_t)offset, regions, nr_regions, + sizeof(regions[0]), match_region); +} + +void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_set_vmcr(vcpu, vmcr); + else + vgic_v3_set_vmcr(vcpu, vmcr); +} + +void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_get_vmcr(vcpu, vmcr); + else + vgic_v3_get_vmcr(vcpu, vmcr); +} + +/* + * kvm_mmio_read_buf() returns a value in a format where it can be converted + * to a byte array and be directly observed as the guest wanted it to appear + * in memory if it had done the store itself, which is LE for the GIC, as the + * guest knows the GIC is always LE. + * + * We convert this value to the CPUs native format to deal with it as a data + * value. 
/*
 * Convert a value taken from an MMIO buffer into the CPU's native format.
 *
 * kvm_mmio_read_buf() yields the value as the guest laid it out in memory,
 * which is little endian for the GIC. Single-byte accesses are returned as
 * read; 2- and 4-byte accesses are converted with le16_to_cpu() and
 * le32_to_cpu(); every other length goes through le64_to_cpu().
 */
unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len)
{
	unsigned long raw = kvm_mmio_read_buf(val, len);

	if (len == 1)
		return raw;
	if (len == 2)
		return le16_to_cpu(raw);
	if (len == 4)
		return le32_to_cpu(raw);

	return le64_to_cpu(raw);
}
*/ + return VGIC_ADDR_TO_INTID(addr, region->bits_per_irq) < nr_irqs; + } + + return false; +} + +const struct vgic_register_region * +vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, + gpa_t addr, int len) +{ + const struct vgic_register_region *region; + + region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, + addr - iodev->base_addr); + if (!region || !check_region(vcpu->kvm, region, addr, len)) + return NULL; + + return region; +} + +static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, u32 *val) +{ + struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); + const struct vgic_register_region *region; + struct kvm_vcpu *r_vcpu; + + region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32)); + if (!region) { + *val = 0; + return 0; + } + + r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; + if (region->uaccess_read) + *val = region->uaccess_read(r_vcpu, addr, sizeof(u32)); + else + *val = region->read(r_vcpu, addr, sizeof(u32)); + + return 0; +} + +static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, const u32 *val) +{ + struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); + const struct vgic_register_region *region; + struct kvm_vcpu *r_vcpu; + + region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32)); + if (!region) + return 0; + + r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; + if (region->uaccess_write) + return region->uaccess_write(r_vcpu, addr, sizeof(u32), *val); + + region->write(r_vcpu, addr, sizeof(u32), *val); + return 0; +} + +/* + * Userland access to VGIC registers. 
+ */ +int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, + bool is_write, int offset, u32 *val) +{ + if (is_write) + return vgic_uaccess_write(vcpu, &dev->dev, offset, val); + else + return vgic_uaccess_read(vcpu, &dev->dev, offset, val); +} + +static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); + const struct vgic_register_region *region; + unsigned long data = 0; + + region = vgic_get_mmio_region(vcpu, iodev, addr, len); + if (!region) { + memset(val, 0, len); + return 0; + } + + switch (iodev->iodev_type) { + case IODEV_CPUIF: + data = region->read(vcpu, addr, len); + break; + case IODEV_DIST: + data = region->read(vcpu, addr, len); + break; + case IODEV_REDIST: + data = region->read(iodev->redist_vcpu, addr, len); + break; + case IODEV_ITS: + data = region->its_read(vcpu->kvm, iodev->its, addr, len); + break; + } + + vgic_data_host_to_mmio_bus(val, len, data); + return 0; +} + +static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); + const struct vgic_register_region *region; + unsigned long data = vgic_data_mmio_bus_to_host(val, len); + + region = vgic_get_mmio_region(vcpu, iodev, addr, len); + if (!region) + return 0; + + switch (iodev->iodev_type) { + case IODEV_CPUIF: + region->write(vcpu, addr, len, data); + break; + case IODEV_DIST: + region->write(vcpu, addr, len, data); + break; + case IODEV_REDIST: + region->write(iodev->redist_vcpu, addr, len, data); + break; + case IODEV_ITS: + region->its_write(vcpu->kvm, iodev->its, addr, len, data); + break; + } + + return 0; +} + +struct kvm_io_device_ops kvm_io_gic_ops = { + .read = dispatch_mmio_read, + .write = dispatch_mmio_write, +}; + +int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, + enum vgic_type type) +{ + struct 
/*
 * Describes one contiguous range of VGIC registers and the handlers that
 * emulate it.
 */
struct vgic_register_region {
	/* First offset covered; match_region() compares accesses against it. */
	unsigned int reg_offset;
	/* Size of the range: offsets in [reg_offset, reg_offset + len) match. */
	unsigned int len;
	/*
	 * Bits of state per interrupt. When zero, check_region() skips the
	 * "is this INTID allocated" test.
	 */
	unsigned int bits_per_irq;
	/* Mask of VGIC_ACCESS_* widths that check_region() accepts. */
	unsigned int access_flags;
	/* Guest read handler; dispatch_mmio_read() uses its_read for IODEV_ITS. */
	union {
		unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
				      unsigned int len);
		unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its,
					  gpa_t addr, unsigned int len);
	};
	/* Guest write handler; dispatch_mmio_write() uses its_write for IODEV_ITS. */
	union {
		void (*write)(struct kvm_vcpu *vcpu, gpa_t addr,
			      unsigned int len, unsigned long val);
		void (*its_write)(struct kvm *kvm, struct vgic_its *its,
				  gpa_t addr, unsigned int len,
				  unsigned long val);
	};
	/* Optional: vgic_uaccess_read() prefers it over ->read when non-NULL. */
	unsigned long (*uaccess_read)(struct kvm_vcpu *vcpu, gpa_t addr,
				      unsigned int len);
	/*
	 * Optional: vgic_uaccess_write() prefers uaccess_write over ->write
	 * when non-NULL. NOTE(review): uaccess_its_write is presumably the ITS
	 * counterpart; its caller is not visible here.
	 */
	union {
		int (*uaccess_write)(struct kvm_vcpu *vcpu, gpa_t addr,
				     unsigned int len, unsigned long val);
		int (*uaccess_its_write)(struct kvm *kvm, struct vgic_its *its,
					 gpa_t addr, unsigned int len,
					 unsigned long val);
	};
};
number of bytes required to address + * up to 1024 interrupts, each represented by <bits> bits. This assumes + * that <bits> is a power of two. + */ +#define VGIC_ADDR_IRQ_MASK(bits) (((bits) * 1024 / 8) - 1) + +/* + * (addr & mask) gives us the _byte_ offset for the INT ID. + * We multiply this by 8 to get the _bit_ offset, then divide this by + * the number of bits to learn the actual INT ID. + * But instead of a division (which requires a "long long div" implementation), + * we shift by the binary logarithm of <bits>. + * This assumes that <bits> is a power of two. + */ +#define VGIC_ADDR_TO_INTID(addr, bits) (((addr) & VGIC_ADDR_IRQ_MASK(bits)) * \ + 8 >> ilog2(bits)) + +/* + * Some VGIC registers store per-IRQ information, with a different number + * of bits per IRQ. For those registers this macro is used. + * The _WITH_LENGTH version instantiates registers with a fixed length + * and is mutually exclusive with the _PER_IRQ version. + */ +#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, ur, uw, bpi, acc) \ + { \ + .reg_offset = off, \ + .bits_per_irq = bpi, \ + .len = bpi * 1024 / 8, \ + .access_flags = acc, \ + .read = rd, \ + .write = wr, \ + .uaccess_read = ur, \ + .uaccess_write = uw, \ + } + +#define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc) \ + { \ + .reg_offset = off, \ + .bits_per_irq = 0, \ + .len = length, \ + .access_flags = acc, \ + .read = rd, \ + .write = wr, \ + } + +#define REGISTER_DESC_WITH_LENGTH_UACCESS(off, rd, wr, urd, uwr, length, acc) \ + { \ + .reg_offset = off, \ + .bits_per_irq = 0, \ + .len = length, \ + .access_flags = acc, \ + .read = rd, \ + .write = wr, \ + .uaccess_read = urd, \ + .uaccess_write = uwr, \ + } + +unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len); + +void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, + unsigned long data); + +unsigned long extract_bytes(u64 data, unsigned int offset, + unsigned int num); + +u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, + unsigned long 
val); + +unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + +int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + +unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len); + +void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + +unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +void 
vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_config(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, + bool is_write, int offset, u32 *val); + +u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid); + +void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, + const u64 val); + +unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); + +unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev); + +u64 vgic_sanitise_outer_cacheability(u64 reg); +u64 vgic_sanitise_inner_cacheability(u64 reg); +u64 vgic_sanitise_shareability(u64 reg); +u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift, + u64 (*sanitise_fn)(u64)); + +/* Find the proper register handler entry given a certain address offset */ +const struct vgic_register_region * +vgic_find_mmio_region(const struct vgic_register_region *regions, + int nr_regions, unsigned int offset); + +#endif diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c new file mode 100644 index 000000000000..621cc168fe3f --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -0,0 +1,504 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015, 2016 ARM Ltd. 
+ */ + +#include +#include +#include +#include +#include + +#include "vgic.h" + +static inline void vgic_v2_write_lr(int lr, u32 val) +{ + void __iomem *base = kvm_vgic_global_state.vctrl_base; + + writel_relaxed(val, base + GICH_LR0 + (lr * 4)); +} + +void vgic_v2_init_lrs(void) +{ + int i; + + for (i = 0; i < kvm_vgic_global_state.nr_lr; i++) + vgic_v2_write_lr(i, 0); +} + +void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; + + cpuif->vgic_hcr |= GICH_HCR_UIE; +} + +static bool lr_signals_eoi_mi(u32 lr_val) +{ + return !(lr_val & GICH_LR_STATE) && (lr_val & GICH_LR_EOI) && + !(lr_val & GICH_LR_HW); +} + +/* + * transfer the content of the LRs back into the corresponding ap_list: + * - active bit is transferred as is + * - pending bit is + * - transferred as is in case of edge sensitive IRQs + * - set to the line-level (resample time) for level sensitive IRQs + */ +void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; + int lr; + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + + cpuif->vgic_hcr &= ~GICH_HCR_UIE; + + for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { + u32 val = cpuif->vgic_lr[lr]; + u32 cpuid, intid = val & GICH_LR_VIRTUALID; + struct vgic_irq *irq; + + /* Extract the source vCPU id from the LR */ + cpuid = val & GICH_LR_PHYSID_CPUID; + cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; + cpuid &= 7; + + /* Notify fds when the guest EOI'ed a level-triggered SPI */ + if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + + irq = vgic_get_irq(vcpu->kvm, vcpu, intid); + + raw_spin_lock(&irq->irq_lock); + + /* Always preserve the active bit */ + irq->active = !!(val & GICH_LR_ACTIVE_BIT); + + if (irq->active && vgic_irq_is_sgi(intid)) + irq->active_source = cpuid; + + /* Edge is the only case where we preserve the pending bit 
*/ + if (irq->config == VGIC_CONFIG_EDGE && + (val & GICH_LR_PENDING_BIT)) { + irq->pending_latch = true; + + if (vgic_irq_is_sgi(intid)) + irq->source |= (1 << cpuid); + } + + /* + * Clear soft pending state when level irqs have been acked. + */ + if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE)) + irq->pending_latch = false; + + /* + * Level-triggered mapped IRQs are special because we only + * observe rising edges as input to the VGIC. + * + * If the guest never acked the interrupt we have to sample + * the physical line and set the line level, because the + * device state could have changed or we simply need to + * process the still pending interrupt later. + * + * If this causes us to lower the level, we have to also clear + * the physical active state, since we will otherwise never be + * told when the interrupt becomes asserted again. + */ + if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) { + irq->line_level = vgic_get_phys_line_level(irq); + + if (!irq->line_level) + vgic_irq_set_phys_active(irq, false); + } + + raw_spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); + } + + vgic_cpu->used_lrs = 0; +} + +/* + * Populates the particular LR with the state of a given IRQ: + * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq + * - for a level sensitive IRQ the pending state value is unchanged; + * it is dictated directly by the input level + * + * If @irq describes an SGI with multiple sources, we choose the + * lowest-numbered source VCPU and clear that bit in the source bitmap. + * + * The irq_lock must be held by the caller. 
/*
 * Populates the particular LR with the state of a given IRQ:
 * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq
 * - for a level sensitive IRQ the pending state value is unchanged;
 *   it is dictated directly by the input level
 *
 * If @irq describes an SGI with multiple sources, we choose the
 * lowest-numbered source VCPU and clear that bit in the source bitmap.
 *
 * @vcpu: VCPU whose shadow list registers (vgic_v2.vgic_lr[]) are written
 * @irq:  interrupt to encode; the irq_lock must be held by the caller
 * @lr:   index of the shadow list register to fill
 *
 * The value is only stored in the shadow copy; nothing is written to the
 * hardware here.
 */
void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
{
	u32 val = irq->intid;
	bool allow_pending = true;

	if (irq->active) {
		val |= GICH_LR_ACTIVE_BIT;
		/* An active SGI carries the VCPU it was taken from. */
		if (vgic_irq_is_sgi(irq->intid))
			val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT;
		/* Active multi-source SGI: never queue pending+active. */
		if (vgic_irq_is_multi_sgi(irq)) {
			allow_pending = false;
			val |= GICH_LR_EOI;
		}
	}

	if (irq->group)
		val |= GICH_LR_GROUP1;

	if (irq->hw) {
		val |= GICH_LR_HW;
		val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT;
		/*
		 * Never set pending+active on a HW interrupt, as the
		 * pending state is kept at the physical distributor
		 * level.
		 */
		if (irq->active)
			allow_pending = false;
	} else {
		if (irq->config == VGIC_CONFIG_LEVEL) {
			val |= GICH_LR_EOI;

			/*
			 * Software resampling doesn't work very well
			 * if we allow P+A, so let's not do that.
			 */
			if (irq->active)
				allow_pending = false;
		}
	}

	if (allow_pending && irq_is_pending(irq)) {
		val |= GICH_LR_PENDING_BIT;

		/* The edge latch is consumed once it is handed to the LR. */
		if (irq->config == VGIC_CONFIG_EDGE)
			irq->pending_latch = false;

		if (vgic_irq_is_sgi(irq->intid)) {
			/* ffs() is 1-based; 0 means no source bit is set. */
			u32 src = ffs(irq->source);

			/*
			 * Bail out without writing the LR. Note that
			 * pending_latch may already have been cleared above.
			 */
			if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
					   irq->intid))
				return;

			val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
			irq->source &= ~(1 << (src - 1));
			/* More sources remain: keep it latched and set EOI. */
			if (irq->source) {
				irq->pending_latch = true;
				val |= GICH_LR_EOI;
			}
		}
	}

	/*
	 * Level-triggered mapped IRQs are special because we only observe
	 * rising edges as input to the VGIC. We therefore lower the line
	 * level here, so that we can take new virtual IRQs. See
	 * vgic_v2_fold_lr_state for more info.
	 */
	if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT))
		irq->line_level = false;

	/* The GICv2 LR only holds five bits of priority. */
	val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;

	vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val;
}
GICH_VMCR_BINPOINT_SHIFT; + vmcrp->pmr = ((vmcr & GICH_VMCR_PRIMASK_MASK) >> + GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT; +} + +void vgic_v2_enable(struct kvm_vcpu *vcpu) +{ + /* + * By forcing VMCR to zero, the GIC will restore the binary + * points to their reset values. Anything else resets to zero + * anyway. + */ + vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; + + /* Get the show on the road... */ + vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; +} + +/* check for overlapping regions and for regions crossing the end of memory */ +static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base) +{ + if (dist_base + KVM_VGIC_V2_DIST_SIZE < dist_base) + return false; + if (cpu_base + KVM_VGIC_V2_CPU_SIZE < cpu_base) + return false; + + if (dist_base + KVM_VGIC_V2_DIST_SIZE <= cpu_base) + return true; + if (cpu_base + KVM_VGIC_V2_CPU_SIZE <= dist_base) + return true; + + return false; +} + +int vgic_v2_map_resources(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + int ret = 0; + + if (vgic_ready(kvm)) + goto out; + + if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || + IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) { + kvm_err("Need to set vgic cpu and dist addresses first\n"); + ret = -ENXIO; + goto out; + } + + if (!vgic_v2_check_base(dist->vgic_dist_base, dist->vgic_cpu_base)) { + kvm_err("VGIC CPU and dist frames overlap\n"); + ret = -EINVAL; + goto out; + } + + /* + * Initialize the vgic if this hasn't already been done on demand by + * accessing the vgic state from userspace. 
+ */ + ret = vgic_init(kvm); + if (ret) { + kvm_err("Unable to initialize VGIC dynamic data structures\n"); + goto out; + } + + ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V2); + if (ret) { + kvm_err("Unable to register VGIC MMIO regions\n"); + goto out; + } + + if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) { + ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base, + kvm_vgic_global_state.vcpu_base, + KVM_VGIC_V2_CPU_SIZE, true); + if (ret) { + kvm_err("Unable to remap VGIC CPU to VCPU\n"); + goto out; + } + } + + dist->ready = true; + +out: + return ret; +} + +DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap); + +/** + * vgic_v2_probe - probe for a VGICv2 compatible interrupt controller + * @info: pointer to the GIC description + * + * Returns 0 if the VGICv2 has been probed successfully, returns an error code + * otherwise + */ +int vgic_v2_probe(const struct gic_kvm_info *info) +{ + int ret; + u32 vtr; + + if (!info->vctrl.start) { + kvm_err("GICH not present in the firmware table\n"); + return -ENXIO; + } + + if (!PAGE_ALIGNED(info->vcpu.start) || + !PAGE_ALIGNED(resource_size(&info->vcpu))) { + kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n"); + + ret = create_hyp_io_mappings(info->vcpu.start, + resource_size(&info->vcpu), + &kvm_vgic_global_state.vcpu_base_va, + &kvm_vgic_global_state.vcpu_hyp_va); + if (ret) { + kvm_err("Cannot map GICV into hyp\n"); + goto out; + } + + static_branch_enable(&vgic_v2_cpuif_trap); + } + + ret = create_hyp_io_mappings(info->vctrl.start, + resource_size(&info->vctrl), + &kvm_vgic_global_state.vctrl_base, + &kvm_vgic_global_state.vctrl_hyp); + if (ret) { + kvm_err("Cannot map VCTRL into hyp\n"); + goto out; + } + + vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR); + kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1; + + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); + if (ret) { + kvm_err("Cannot register GICv2 KVM device\n"); + goto out; + } + + 
kvm_vgic_global_state.can_emulate_gicv2 = true; + kvm_vgic_global_state.vcpu_base = info->vcpu.start; + kvm_vgic_global_state.type = VGIC_V2; + kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS; + + kvm_debug("vgic-v2@%llx\n", info->vctrl.start); + + return 0; +out: + if (kvm_vgic_global_state.vctrl_base) + iounmap(kvm_vgic_global_state.vctrl_base); + if (kvm_vgic_global_state.vcpu_base_va) + iounmap(kvm_vgic_global_state.vcpu_base_va); + + return ret; +} + +static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; + u64 elrsr; + int i; + + elrsr = readl_relaxed(base + GICH_ELRSR0); + if (unlikely(used_lrs > 32)) + elrsr |= ((u64)readl_relaxed(base + GICH_ELRSR1)) << 32; + + for (i = 0; i < used_lrs; i++) { + if (elrsr & (1UL << i)) + cpu_if->vgic_lr[i] &= ~GICH_LR_STATE; + else + cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4)); + + writel_relaxed(0, base + GICH_LR0 + (i * 4)); + } +} + +void vgic_v2_save_state(struct kvm_vcpu *vcpu) +{ + void __iomem *base = kvm_vgic_global_state.vctrl_base; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; + + if (!base) + return; + + if (used_lrs) { + save_lrs(vcpu, base); + writel_relaxed(0, base + GICH_HCR); + } +} + +void vgic_v2_restore_state(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + void __iomem *base = kvm_vgic_global_state.vctrl_base; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; + int i; + + if (!base) + return; + + if (used_lrs) { + writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR); + for (i = 0; i < used_lrs; i++) { + writel_relaxed(cpu_if->vgic_lr[i], + base + GICH_LR0 + (i * 4)); + } + } +} + +void vgic_v2_load(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + + writel_relaxed(cpu_if->vgic_vmcr, + kvm_vgic_global_state.vctrl_base + GICH_VMCR); + writel_relaxed(cpu_if->vgic_apr, + 
kvm_vgic_global_state.vctrl_base + GICH_APR); +} + +void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + + cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); +} + +void vgic_v2_put(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + + vgic_v2_vmcr_sync(vcpu); + cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR); +} diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c new file mode 100644 index 000000000000..5bc2ab58954b --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -0,0 +1,691 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include +#include + +#include "vgic.h" + +static bool group0_trap; +static bool group1_trap; +static bool common_trap; +static bool gicv4_enable; + +void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; + + cpuif->vgic_hcr |= ICH_HCR_UIE; +} + +static bool lr_signals_eoi_mi(u64 lr_val) +{ + return !(lr_val & ICH_LR_STATE) && (lr_val & ICH_LR_EOI) && + !(lr_val & ICH_LR_HW); +} + +void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; + u32 model = vcpu->kvm->arch.vgic.vgic_model; + int lr; + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + + cpuif->vgic_hcr &= ~ICH_HCR_UIE; + + for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { + u64 val = cpuif->vgic_lr[lr]; + u32 intid, cpuid; + struct vgic_irq *irq; + bool is_v2_sgi = false; + + cpuid = val & GICH_LR_PHYSID_CPUID; + cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; + + if (model == KVM_DEV_TYPE_ARM_VGIC_V3) { + intid = val & ICH_LR_VIRTUAL_ID_MASK; + } else { + intid = val & GICH_LR_VIRTUALID; + is_v2_sgi = vgic_irq_is_sgi(intid); + } + + /* Notify fds when the guest EOI'ed a level-triggered IRQ */ + if 
(lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + + irq = vgic_get_irq(vcpu->kvm, vcpu, intid); + if (!irq) /* An LPI could have been unmapped. */ + continue; + + raw_spin_lock(&irq->irq_lock); + + /* Always preserve the active bit */ + irq->active = !!(val & ICH_LR_ACTIVE_BIT); + + if (irq->active && is_v2_sgi) + irq->active_source = cpuid; + + /* Edge is the only case where we preserve the pending bit */ + if (irq->config == VGIC_CONFIG_EDGE && + (val & ICH_LR_PENDING_BIT)) { + irq->pending_latch = true; + + if (is_v2_sgi) + irq->source |= (1 << cpuid); + } + + /* + * Clear soft pending state when level irqs have been acked. + */ + if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) + irq->pending_latch = false; + + /* + * Level-triggered mapped IRQs are special because we only + * observe rising edges as input to the VGIC. + * + * If the guest never acked the interrupt we have to sample + * the physical line and set the line level, because the + * device state could have changed or we simply need to + * process the still pending interrupt later. + * + * If this causes us to lower the level, we have to also clear + * the physical active state, since we will otherwise never be + * told when the interrupt becomes asserted again. 
+ */ + if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) { + irq->line_level = vgic_get_phys_line_level(irq); + + if (!irq->line_level) + vgic_irq_set_phys_active(irq, false); + } + + raw_spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); + } + + vgic_cpu->used_lrs = 0; +} + +/* Requires the irq to be locked already */ +void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +{ + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u64 val = irq->intid; + bool allow_pending = true, is_v2_sgi; + + is_v2_sgi = (vgic_irq_is_sgi(irq->intid) && + model == KVM_DEV_TYPE_ARM_VGIC_V2); + + if (irq->active) { + val |= ICH_LR_ACTIVE_BIT; + if (is_v2_sgi) + val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT; + if (vgic_irq_is_multi_sgi(irq)) { + allow_pending = false; + val |= ICH_LR_EOI; + } + } + + if (irq->hw) { + val |= ICH_LR_HW; + val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT; + /* + * Never set pending+active on a HW interrupt, as the + * pending state is kept at the physical distributor + * level. + */ + if (irq->active) + allow_pending = false; + } else { + if (irq->config == VGIC_CONFIG_LEVEL) { + val |= ICH_LR_EOI; + + /* + * Software resampling doesn't work very well + * if we allow P+A, so let's not do that. + */ + if (irq->active) + allow_pending = false; + } + } + + if (allow_pending && irq_is_pending(irq)) { + val |= ICH_LR_PENDING_BIT; + + if (irq->config == VGIC_CONFIG_EDGE) + irq->pending_latch = false; + + if (vgic_irq_is_sgi(irq->intid) && + model == KVM_DEV_TYPE_ARM_VGIC_V2) { + u32 src = ffs(irq->source); + + if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", + irq->intid)) + return; + + val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; + irq->source &= ~(1 << (src - 1)); + if (irq->source) { + irq->pending_latch = true; + val |= ICH_LR_EOI; + } + } + } + + /* + * Level-triggered mapped IRQs are special because we only observe + * rising edges as input to the VGIC. 
We therefore lower the line + * level here, so that we can take new virtual IRQs. See + * vgic_v3_fold_lr_state for more info. + */ + if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) + irq->line_level = false; + + if (irq->group) + val |= ICH_LR_GROUP; + + val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; + + vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; +} + +void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) +{ + vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0; +} + +void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u32 vmcr; + + if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { + vmcr = (vmcrp->ackctl << ICH_VMCR_ACK_CTL_SHIFT) & + ICH_VMCR_ACK_CTL_MASK; + vmcr |= (vmcrp->fiqen << ICH_VMCR_FIQ_EN_SHIFT) & + ICH_VMCR_FIQ_EN_MASK; + } else { + /* + * When emulating GICv3 on GICv3 with SRE=1 on the + * VFIQEn bit is RES1 and the VAckCtl bit is RES0. 
+ */ + vmcr = ICH_VMCR_FIQ_EN_MASK; + } + + vmcr |= (vmcrp->cbpr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK; + vmcr |= (vmcrp->eoim << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK; + vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK; + vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK; + vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK; + vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK; + vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK; + + cpu_if->vgic_vmcr = vmcr; +} + +void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u32 vmcr; + + vmcr = cpu_if->vgic_vmcr; + + if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { + vmcrp->ackctl = (vmcr & ICH_VMCR_ACK_CTL_MASK) >> + ICH_VMCR_ACK_CTL_SHIFT; + vmcrp->fiqen = (vmcr & ICH_VMCR_FIQ_EN_MASK) >> + ICH_VMCR_FIQ_EN_SHIFT; + } else { + /* + * When emulating GICv3 on GICv3 with SRE=1 on the + * VFIQEn bit is RES1 and the VAckCtl bit is RES0. 
+ */ + vmcrp->fiqen = 1; + vmcrp->ackctl = 0; + } + + vmcrp->cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; + vmcrp->eoim = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT; + vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; + vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; + vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; + vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT; + vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT; +} + +#define INITIAL_PENDBASER_VALUE \ + (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) | \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)) + +void vgic_v3_enable(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; + + /* + * By forcing VMCR to zero, the GIC will restore the binary + * points to their reset values. Anything else resets to zero + * anyway. + */ + vgic_v3->vgic_vmcr = 0; + + /* + * If we are emulating a GICv3, we do it in an non-GICv2-compatible + * way, so we force SRE to 1 to demonstrate this to the guest. + * Also, we don't support any form of IRQ/FIQ bypass. + * This goes with the spec allowing the value to be RAO/WI. + */ + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + vgic_v3->vgic_sre = (ICC_SRE_EL1_DIB | + ICC_SRE_EL1_DFB | + ICC_SRE_EL1_SRE); + vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE; + } else { + vgic_v3->vgic_sre = 0; + } + + vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 & + ICH_VTR_ID_BITS_MASK) >> + ICH_VTR_ID_BITS_SHIFT; + vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 & + ICH_VTR_PRI_BITS_MASK) >> + ICH_VTR_PRI_BITS_SHIFT) + 1; + + /* Get the show on the road... 
*/ + vgic_v3->vgic_hcr = ICH_HCR_EN; + if (group0_trap) + vgic_v3->vgic_hcr |= ICH_HCR_TALL0; + if (group1_trap) + vgic_v3->vgic_hcr |= ICH_HCR_TALL1; + if (common_trap) + vgic_v3->vgic_hcr |= ICH_HCR_TC; +} + +int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) +{ + struct kvm_vcpu *vcpu; + int byte_offset, bit_nr; + gpa_t pendbase, ptr; + bool status; + u8 val; + int ret; + unsigned long flags; + +retry: + vcpu = irq->target_vcpu; + if (!vcpu) + return 0; + + pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); + + byte_offset = irq->intid / BITS_PER_BYTE; + bit_nr = irq->intid % BITS_PER_BYTE; + ptr = pendbase + byte_offset; + + ret = kvm_read_guest_lock(kvm, ptr, &val, 1); + if (ret) + return ret; + + status = val & (1 << bit_nr); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->target_vcpu != vcpu) { + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + goto retry; + } + irq->pending_latch = status; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + + if (status) { + /* clear consumed data */ + val &= ~(1 << bit_nr); + ret = kvm_write_guest_lock(kvm, ptr, &val, 1); + if (ret) + return ret; + } + return 0; +} + +/** + * vgic_v3_save_pending_tables - Save the pending tables into guest RAM + * kvm lock and all vcpu lock must be held + */ +int vgic_v3_save_pending_tables(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq; + gpa_t last_ptr = ~(gpa_t)0; + int ret; + u8 val; + + list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { + int byte_offset, bit_nr; + struct kvm_vcpu *vcpu; + gpa_t pendbase, ptr; + bool stored; + + vcpu = irq->target_vcpu; + if (!vcpu) + continue; + + pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); + + byte_offset = irq->intid / BITS_PER_BYTE; + bit_nr = irq->intid % BITS_PER_BYTE; + ptr = pendbase + byte_offset; + + if (ptr != last_ptr) { + ret = kvm_read_guest_lock(kvm, ptr, &val, 1); + if (ret) + return ret; + last_ptr = ptr; + } + + 
stored = val & (1U << bit_nr); + if (stored == irq->pending_latch) + continue; + + if (irq->pending_latch) + val |= 1 << bit_nr; + else + val &= ~(1 << bit_nr); + + ret = kvm_write_guest_lock(kvm, ptr, &val, 1); + if (ret) + return ret; + } + return 0; +} + +/** + * vgic_v3_rdist_overlap - check if a region overlaps with any + * existing redistributor region + * + * @kvm: kvm handle + * @base: base of the region + * @size: size of region + * + * Return: true if there is an overlap + */ +bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size) +{ + struct vgic_dist *d = &kvm->arch.vgic; + struct vgic_redist_region *rdreg; + + list_for_each_entry(rdreg, &d->rd_regions, list) { + if ((base + size > rdreg->base) && + (base < rdreg->base + vgic_v3_rd_region_size(kvm, rdreg))) + return true; + } + return false; +} + +/* + * Check for overlapping regions and for regions crossing the end of memory + * for base addresses which have already been set. + */ +bool vgic_v3_check_base(struct kvm *kvm) +{ + struct vgic_dist *d = &kvm->arch.vgic; + struct vgic_redist_region *rdreg; + + if (!IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) && + d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base) + return false; + + list_for_each_entry(rdreg, &d->rd_regions, list) { + if (rdreg->base + vgic_v3_rd_region_size(kvm, rdreg) < + rdreg->base) + return false; + } + + if (IS_VGIC_ADDR_UNDEF(d->vgic_dist_base)) + return true; + + return !vgic_v3_rdist_overlap(kvm, d->vgic_dist_base, + KVM_VGIC_V3_DIST_SIZE); +} + +/** + * vgic_v3_rdist_free_slot - Look up registered rdist regions and identify one + * which has free space to put a new rdist region. + * + * @rd_regions: redistributor region list head + * + * A redistributor regions maps n redistributors, n = region size / (2 x 64kB). + * Stride between redistributors is 0 and regions are filled in the index order. + * + * Return: the redist region handle, if any, that has space to map a new rdist + * region. 
+ */ +struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rd_regions) +{ + struct vgic_redist_region *rdreg; + + list_for_each_entry(rdreg, rd_regions, list) { + if (!vgic_v3_redist_region_full(rdreg)) + return rdreg; + } + return NULL; +} + +struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, + u32 index) +{ + struct list_head *rd_regions = &kvm->arch.vgic.rd_regions; + struct vgic_redist_region *rdreg; + + list_for_each_entry(rdreg, rd_regions, list) { + if (rdreg->index == index) + return rdreg; + } + return NULL; +} + + +int vgic_v3_map_resources(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int ret = 0; + int c; + + if (vgic_ready(kvm)) + goto out; + + kvm_for_each_vcpu(c, vcpu, kvm) { + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + if (IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) { + kvm_debug("vcpu %d redistributor base not set\n", c); + ret = -ENXIO; + goto out; + } + } + + if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base)) { + kvm_err("Need to set vgic distributor addresses first\n"); + ret = -ENXIO; + goto out; + } + + if (!vgic_v3_check_base(kvm)) { + kvm_err("VGIC redist and dist frames overlap\n"); + ret = -EINVAL; + goto out; + } + + /* + * For a VGICv3 we require the userland to explicitly initialize + * the VGIC before we need to use it. 
+ */ + if (!vgic_initialized(kvm)) { + ret = -EBUSY; + goto out; + } + + ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V3); + if (ret) { + kvm_err("Unable to register VGICv3 dist MMIO regions\n"); + goto out; + } + + if (kvm_vgic_global_state.has_gicv4_1) + vgic_v4_configure_vsgis(kvm); + dist->ready = true; + +out: + return ret; +} + +DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap); + +static int __init early_group0_trap_cfg(char *buf) +{ + return strtobool(buf, &group0_trap); +} +early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg); + +static int __init early_group1_trap_cfg(char *buf) +{ + return strtobool(buf, &group1_trap); +} +early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg); + +static int __init early_common_trap_cfg(char *buf) +{ + return strtobool(buf, &common_trap); +} +early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg); + +static int __init early_gicv4_enable(char *buf) +{ + return strtobool(buf, &gicv4_enable); +} +early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); + +/** + * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller + * @info: pointer to the GIC description + * + * Returns 0 if the VGICv3 has been probed successfully, returns an error code + * otherwise + */ +int vgic_v3_probe(const struct gic_kvm_info *info) +{ + u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2); + int ret; + + /* + * The ListRegs field is 5 bits, but there is a architectural + * maximum of 16 list registers. Just ignore bit 4... + */ + kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1; + kvm_vgic_global_state.can_emulate_gicv2 = false; + kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2; + + /* GICv4 support? */ + if (info->has_v4) { + kvm_vgic_global_state.has_gicv4 = gicv4_enable; + kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable; + kvm_info("GICv4%s support %sabled\n", + kvm_vgic_global_state.has_gicv4_1 ? ".1" : "", + gicv4_enable ? 
"en" : "dis"); + } + + if (!info->vcpu.start) { + kvm_info("GICv3: no GICV resource entry\n"); + kvm_vgic_global_state.vcpu_base = 0; + } else if (!PAGE_ALIGNED(info->vcpu.start)) { + pr_warn("GICV physical address 0x%llx not page aligned\n", + (unsigned long long)info->vcpu.start); + kvm_vgic_global_state.vcpu_base = 0; + } else { + kvm_vgic_global_state.vcpu_base = info->vcpu.start; + kvm_vgic_global_state.can_emulate_gicv2 = true; + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); + if (ret) { + kvm_err("Cannot register GICv2 KVM device.\n"); + return ret; + } + kvm_info("vgic-v2@%llx\n", info->vcpu.start); + } + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3); + if (ret) { + kvm_err("Cannot register GICv3 KVM device.\n"); + kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2); + return ret; + } + + if (kvm_vgic_global_state.vcpu_base == 0) + kvm_info("disabling GICv2 emulation\n"); + + if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_30115)) { + group0_trap = true; + group1_trap = true; + } + + if (group0_trap || group1_trap || common_trap) { + kvm_info("GICv3 sysreg trapping enabled ([%s%s%s], reduced performance)\n", + group0_trap ? "G0" : "", + group1_trap ? "G1" : "", + common_trap ? "C" : ""); + static_branch_enable(&vgic_v3_cpuif_trap); + } + + kvm_vgic_global_state.vctrl_base = NULL; + kvm_vgic_global_state.type = VGIC_V3; + kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS; + + return 0; +} + +void vgic_v3_load(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + + /* + * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen + * is dependent on ICC_SRE_EL1.SRE, and we have to perform the + * VMCR_EL2 save/restore in the world switch. 
+ */ + if (likely(cpu_if->vgic_sre)) + kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); + + kvm_call_hyp(__vgic_v3_restore_aprs, vcpu); + + if (has_vhe()) + __vgic_v3_activate_traps(vcpu); + + WARN_ON(vgic_v4_load(vcpu)); +} + +void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + + if (likely(cpu_if->vgic_sre)) + cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr); +} + +void vgic_v3_put(struct kvm_vcpu *vcpu) +{ + WARN_ON(vgic_v4_put(vcpu, false)); + + vgic_v3_vmcr_sync(vcpu); + + kvm_call_hyp(__vgic_v3_save_aprs, vcpu); + + if (has_vhe()) + __vgic_v3_deactivate_traps(vcpu); +} diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c new file mode 100644 index 000000000000..27ac833e5ec7 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -0,0 +1,453 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 ARM Ltd. + * Author: Marc Zyngier + */ + +#include +#include +#include +#include +#include + +#include "vgic.h" + +/* + * How KVM uses GICv4 (insert rude comments here): + * + * The vgic-v4 layer acts as a bridge between several entities: + * - The GICv4 ITS representation offered by the ITS driver + * - VFIO, which is in charge of the PCI endpoint + * - The virtual ITS, which is the only thing the guest sees + * + * The configuration of VLPIs is triggered by a callback from VFIO, + * instructing KVM that a PCI device has been configured to deliver + * MSIs to a vITS. + * + * kvm_vgic_v4_set_forwarding() is thus called with the routing entry, + * and this is used to find the corresponding vITS data structures + * (ITS instance, device, event and irq) using a process that is + * extremely similar to the injection of an MSI. + * + * At this stage, we can link the guest's view of an LPI (uniquely + * identified by the routing entry) and the host irq, using the GICv4 + * driver mapping operation. 
Should the mapping succeed, we've then + * successfully upgraded the guest's LPI to a VLPI. We can then start + * with updating GICv4's view of the property table and generating an + * INValidation in order to kickstart the delivery of this VLPI to the + * guest directly, without software intervention. Well, almost. + * + * When the PCI endpoint is deconfigured, this operation is reversed + * with VFIO calling kvm_vgic_v4_unset_forwarding(). + * + * Once the VLPI has been mapped, it needs to follow any change the + * guest performs on its LPI through the vITS. For that, a number of + * command handlers have hooks to communicate these changes to the HW: + * - Any invalidation triggers a call to its_prop_update_vlpi() + * - The INT command results in an irq_set_irqchip_state(), which + * generates an INT on the corresponding VLPI. + * - The CLEAR command results in an irq_set_irqchip_state(), which + * generates a CLEAR on the corresponding VLPI. + * - DISCARD translates into an unmap, similar to a call to + * kvm_vgic_v4_unset_forwarding(). + * - MOVI is translated by an update of the existing mapping, changing + * the target vcpu, resulting in a VMOVI being generated. + * - MOVALL is translated by a string of mapping updates (similar to + * the handling of MOVI). MOVALL is horrible. + * + * Note that a DISCARD/MAPTI sequence emitted from the guest without + * reprogramming the PCI endpoint after MAPTI does not result in a + * VLPI being mapped, as there is no callback from VFIO (the guest + * will get the interrupt via the normal SW injection). Fixing this is + * not trivial, and requires some horrible messing with the VFIO + * internals. Not fun. Don't do that. + * + * Then there is the scheduling. Each time a vcpu is about to run on a + * physical CPU, KVM must tell the corresponding redistributor about + * it. And if we've migrated our vcpu from one CPU to another, we must + * tell the ITS (so that the messages reach the right redistributor). 
+ * This is done in two steps: first issue a irq_set_affinity() on the + * irq corresponding to the vcpu, then call its_make_vpe_resident(). + * You must be in a non-preemptible context. On exit, a call to + * its_make_vpe_non_resident() tells the redistributor that we're done + * with the vcpu. + * + * Finally, the doorbell handling: Each vcpu is allocated an interrupt + * which will fire each time a VLPI is made pending whilst the vcpu is + * not running. Each time the vcpu gets blocked, the doorbell + * interrupt gets enabled. When the vcpu is unblocked (for whatever + * reason), the doorbell interrupt is disabled. + */ + +#define DB_IRQ_FLAGS (IRQ_NOAUTOEN | IRQ_DISABLE_UNLAZY | IRQ_NO_BALANCING) + +static irqreturn_t vgic_v4_doorbell_handler(int irq, void *info) +{ + struct kvm_vcpu *vcpu = info; + + /* We got the message, no need to fire again */ + if (!kvm_vgic_global_state.has_gicv4_1 && + !irqd_irq_disabled(&irq_to_desc(irq)->irq_data)) + disable_irq_nosync(irq); + + vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last = true; + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + + return IRQ_HANDLED; +} + +static void vgic_v4_sync_sgi_config(struct its_vpe *vpe, struct vgic_irq *irq) +{ + vpe->sgi_config[irq->intid].enabled = irq->enabled; + vpe->sgi_config[irq->intid].group = irq->group; + vpe->sgi_config[irq->intid].priority = irq->priority; +} + +static void vgic_v4_enable_vsgis(struct kvm_vcpu *vcpu) +{ + struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + int i; + + /* + * With GICv4.1, every virtual SGI can be directly injected. So + * let's pretend that they are HW interrupts, tied to a host + * IRQ. The SGI code will do its magic. 
+ */ + for (i = 0; i < VGIC_NR_SGIS; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i); + struct irq_desc *desc; + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (irq->hw) + goto unlock; + + irq->hw = true; + irq->host_irq = irq_find_mapping(vpe->sgi_domain, i); + + /* Transfer the full irq state to the vPE */ + vgic_v4_sync_sgi_config(vpe, irq); + desc = irq_to_desc(irq->host_irq); + ret = irq_domain_activate_irq(irq_desc_get_irq_data(desc), + false); + if (!WARN_ON(ret)) { + /* Transfer pending state */ + ret = irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + irq->pending_latch); + WARN_ON(ret); + irq->pending_latch = false; + } + unlock: + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu) +{ + int i; + + for (i = 0; i < VGIC_NR_SGIS; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i); + struct irq_desc *desc; + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (!irq->hw) + goto unlock; + + irq->hw = false; + ret = irq_get_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + &irq->pending_latch); + WARN_ON(ret); + + desc = irq_to_desc(irq->host_irq); + irq_domain_deactivate_irq(irq_desc_get_irq_data(desc)); + unlock: + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +/* Must be called with the kvm lock held */ +void vgic_v4_configure_vsgis(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int i; + + kvm_arm_halt_guest(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (dist->nassgireq) + vgic_v4_enable_vsgis(vcpu); + else + vgic_v4_disable_vsgis(vcpu); + } + + kvm_arm_resume_guest(kvm); +} + +/** + * vgic_v4_init - Initialize the GICv4 data structures + * @kvm: Pointer to the VM being initialized + * + * We may be called each time a vITS is created, or when 
the + * vgic is initialized. This relies on kvm->lock to be + * held. In both cases, the number of vcpus should now be + * fixed. + */ +int vgic_v4_init(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int i, nr_vcpus, ret; + + if (!kvm_vgic_global_state.has_gicv4) + return 0; /* Nothing to see here... move along. */ + + if (dist->its_vm.vpes) + return 0; + + nr_vcpus = atomic_read(&kvm->online_vcpus); + + dist->its_vm.vpes = kcalloc(nr_vcpus, sizeof(*dist->its_vm.vpes), + GFP_KERNEL); + if (!dist->its_vm.vpes) + return -ENOMEM; + + dist->its_vm.nr_vpes = nr_vcpus; + + kvm_for_each_vcpu(i, vcpu, kvm) + dist->its_vm.vpes[i] = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + + ret = its_alloc_vcpu_irqs(&dist->its_vm); + if (ret < 0) { + kvm_err("VPE IRQ allocation failure\n"); + kfree(dist->its_vm.vpes); + dist->its_vm.nr_vpes = 0; + dist->its_vm.vpes = NULL; + return ret; + } + + kvm_for_each_vcpu(i, vcpu, kvm) { + int irq = dist->its_vm.vpes[i]->irq; + unsigned long irq_flags = DB_IRQ_FLAGS; + + /* + * Don't automatically enable the doorbell, as we're + * flipping it back and forth when the vcpu gets + * blocked. Also disable the lazy disabling, as the + * doorbell could kick us out of the guest too + * early... + * + * On GICv4.1, the doorbell is managed in HW and must + * be left enabled. + */ + if (kvm_vgic_global_state.has_gicv4_1) + irq_flags &= ~IRQ_NOAUTOEN; + irq_set_status_flags(irq, irq_flags); + + ret = request_irq(irq, vgic_v4_doorbell_handler, + 0, "vcpu", vcpu); + if (ret) { + kvm_err("failed to allocate vcpu IRQ%d\n", irq); + /* + * Trick: adjust the number of vpes so we know + * how many to nuke on teardown... + */ + dist->its_vm.nr_vpes = i; + break; + } + } + + if (ret) + vgic_v4_teardown(kvm); + + return ret; +} + +/** + * vgic_v4_teardown - Free the GICv4 data structures + * @kvm: Pointer to the VM being destroyed + * + * Relies on kvm->lock to be held. 
+ */ +void vgic_v4_teardown(struct kvm *kvm) +{ + struct its_vm *its_vm = &kvm->arch.vgic.its_vm; + int i; + + if (!its_vm->vpes) + return; + + for (i = 0; i < its_vm->nr_vpes; i++) { + struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, i); + int irq = its_vm->vpes[i]->irq; + + irq_clear_status_flags(irq, DB_IRQ_FLAGS); + free_irq(irq, vcpu); + } + + its_free_vcpu_irqs(its_vm); + kfree(its_vm->vpes); + its_vm->nr_vpes = 0; + its_vm->vpes = NULL; +} + +int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db) +{ + struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + + if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident) + return 0; + + return its_make_vpe_non_resident(vpe, need_db); +} + +int vgic_v4_load(struct kvm_vcpu *vcpu) +{ + struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + int err; + + if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident) + return 0; + + /* + * Before making the VPE resident, make sure the redistributor + * corresponding to our current CPU expects us here. See the + * doc in drivers/irqchip/irq-gic-v4.c to understand how this + * turns into a VMOVP command at the ITS level. + */ + err = irq_set_affinity(vpe->irq, cpumask_of(smp_processor_id())); + if (err) + return err; + + err = its_make_vpe_resident(vpe, false, vcpu->kvm->arch.vgic.enabled); + if (err) + return err; + + /* + * Now that the VPE is resident, let's get rid of a potential + * doorbell interrupt that would still be pending. This is a + * GICv4.0 only "feature"... 
+ */ + if (!kvm_vgic_global_state.has_gicv4_1) + err = irq_set_irqchip_state(vpe->irq, IRQCHIP_STATE_PENDING, false); + + return err; +} + +static struct vgic_its *vgic_get_its(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *irq_entry) +{ + struct kvm_msi msi = (struct kvm_msi) { + .address_lo = irq_entry->msi.address_lo, + .address_hi = irq_entry->msi.address_hi, + .data = irq_entry->msi.data, + .flags = irq_entry->msi.flags, + .devid = irq_entry->msi.devid, + }; + + return vgic_msi_to_its(kvm, &msi); +} + +int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, + struct kvm_kernel_irq_routing_entry *irq_entry) +{ + struct vgic_its *its; + struct vgic_irq *irq; + struct its_vlpi_map map; + int ret; + + if (!vgic_supports_direct_msis(kvm)) + return 0; + + /* + * Get the ITS, and escape early on error (not a valid + * doorbell for any of our vITSs). + */ + its = vgic_get_its(kvm, irq_entry); + if (IS_ERR(its)) + return 0; + + mutex_lock(&its->its_lock); + + /* Perform the actual DevID/EventID -> LPI translation. */ + ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, + irq_entry->msi.data, &irq); + if (ret) + goto out; + + /* + * Emit the mapping request. If it fails, the ITS probably + * isn't v4 compatible, so let's silently bail out. Holding + * the ITS lock should ensure that nothing can modify the + * target vcpu. + */ + map = (struct its_vlpi_map) { + .vm = &kvm->arch.vgic.its_vm, + .vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe, + .vintid = irq->intid, + .properties = ((irq->priority & 0xfc) | + (irq->enabled ? 
LPI_PROP_ENABLED : 0) | + LPI_PROP_GROUP1), + .db_enabled = true, + }; + + ret = its_map_vlpi(virq, &map); + if (ret) + goto out; + + irq->hw = true; + irq->host_irq = virq; + atomic_inc(&map.vpe->vlpi_count); + +out: + mutex_unlock(&its->its_lock); + return ret; +} + +int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq, + struct kvm_kernel_irq_routing_entry *irq_entry) +{ + struct vgic_its *its; + struct vgic_irq *irq; + int ret; + + if (!vgic_supports_direct_msis(kvm)) + return 0; + + /* + * Get the ITS, and escape early on error (not a valid + * doorbell for any of our vITSs). + */ + its = vgic_get_its(kvm, irq_entry); + if (IS_ERR(its)) + return 0; + + mutex_lock(&its->its_lock); + + ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, + irq_entry->msi.data, &irq); + if (ret) + goto out; + + WARN_ON(!(irq->hw && irq->host_irq == virq)); + if (irq->hw) { + atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count); + irq->hw = false; + ret = its_unmap_vlpi(virq); + } + +out: + mutex_unlock(&its->its_lock); + return ret; +} diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c new file mode 100644 index 000000000000..99b02ca730a8 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic.c @@ -0,0 +1,1011 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015, 2016 ARM Ltd. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "vgic.h" + +#define CREATE_TRACE_POINTS +#include "trace.h" + +struct vgic_global kvm_vgic_global_state __ro_after_init = { + .gicv3_cpuif = STATIC_KEY_FALSE_INIT, +}; + +/* + * Locking order is always: + * kvm->lock (mutex) + * its->cmd_lock (mutex) + * its->its_lock (mutex) + * vgic_cpu->ap_list_lock must be taken with IRQs disabled + * kvm->lpi_list_lock must be taken with IRQs disabled + * vgic_irq->irq_lock must be taken with IRQs disabled + * + * As the ap_list_lock might be taken from the timer interrupt handler, + * we have to disable IRQs before taking this lock and everything lower + * than it. + * + * If you need to take multiple locks, always take the upper lock first, + * then the lower ones, e.g. first take the its_lock, then the irq_lock. + * If you are already holding a lock and need to take a higher one, you + * have to drop the lower ranking lock first and re-aquire it after having + * taken the upper one. + * + * When taking more than one ap_list_lock at the same time, always take the + * lowest numbered VCPU's ap_list_lock first, so: + * vcpuX->vcpu_id < vcpuY->vcpu_id: + * raw_spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock); + * raw_spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock); + * + * Since the VGIC must support injecting virtual interrupts from ISRs, we have + * to use the raw_spin_lock_irqsave/raw_spin_unlock_irqrestore versions of outer + * spinlocks for any lock that may be taken while injecting an interrupt. + */ + +/* + * Iterate over the VM's list of mapped LPIs to find the one with a + * matching interrupt ID and return a reference to the IRQ structure. 
+ */ +static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq = NULL; + unsigned long flags; + + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + + list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { + if (irq->intid != intid) + continue; + + /* + * This increases the refcount, the caller is expected to + * call vgic_put_irq() later once it's finished with the IRQ. + */ + vgic_get_irq_kref(irq); + goto out_unlock; + } + irq = NULL; + +out_unlock: + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + + return irq; +} + +/* + * This looks up the virtual interrupt ID to get the corresponding + * struct vgic_irq. It also increases the refcount, so any caller is expected + * to call vgic_put_irq() once it's finished with this IRQ. + */ +struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, + u32 intid) +{ + /* SGIs and PPIs */ + if (intid <= VGIC_MAX_PRIVATE) { + intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1); + return &vcpu->arch.vgic_cpu.private_irqs[intid]; + } + + /* SPIs */ + if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) { + intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS); + return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; + } + + /* LPIs */ + if (intid >= VGIC_MIN_LPI) + return vgic_get_lpi(kvm, intid); + + WARN(1, "Looking up struct vgic_irq for reserved INTID"); + return NULL; +} + +/* + * We can't do anything in here, because we lack the kvm pointer to + * lock and remove the item from the lpi_list. So we keep this function + * empty and use the return value of kref_put() to trigger the freeing. + */ +static void vgic_irq_release(struct kref *ref) +{ +} + +/* + * Drop the refcount on the LPI. Must be called with lpi_list_lock held. 
+ */ +void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + + if (!kref_put(&irq->refcount, vgic_irq_release)) + return; + + list_del(&irq->lpi_list); + dist->lpi_list_count--; + + kfree(irq); +} + +void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long flags; + + if (irq->intid < VGIC_MIN_LPI) + return; + + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + __vgic_put_lpi_locked(kvm, irq); + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); +} + +void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_irq *irq, *tmp; + unsigned long flags; + + raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); + + list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) { + if (irq->intid >= VGIC_MIN_LPI) { + raw_spin_lock(&irq->irq_lock); + list_del(&irq->ap_list); + irq->vcpu = NULL; + raw_spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); + } + } + + raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); +} + +void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending) +{ + WARN_ON(irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + pending)); +} + +bool vgic_get_phys_line_level(struct vgic_irq *irq) +{ + bool line_level; + + BUG_ON(!irq->hw); + + if (irq->get_input_level) + return irq->get_input_level(irq->intid); + + WARN_ON(irq_get_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + &line_level)); + return line_level; +} + +/* Set/Clear the physical active state */ +void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active) +{ + + BUG_ON(!irq->hw); + WARN_ON(irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_ACTIVE, + active)); +} + +/** + * kvm_vgic_target_oracle - compute the target vcpu for an irq + * + * @irq: The irq to route. Must be already locked. 
+ * + * Based on the current state of the interrupt (enabled, pending, + * active, vcpu and target_vcpu), compute the next vcpu this should be + * given to. Return NULL if this shouldn't be injected at all. + * + * Requires the IRQ lock to be held. + */ +static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) +{ + lockdep_assert_held(&irq->irq_lock); + + /* If the interrupt is active, it must stay on the current vcpu */ + if (irq->active) + return irq->vcpu ? : irq->target_vcpu; + + /* + * If the IRQ is not active but enabled and pending, we should direct + * it to its configured target VCPU. + * If the distributor is disabled, pending interrupts shouldn't be + * forwarded. + */ + if (irq->enabled && irq_is_pending(irq)) { + if (unlikely(irq->target_vcpu && + !irq->target_vcpu->kvm->arch.vgic.enabled)) + return NULL; + + return irq->target_vcpu; + } + + /* If neither active nor pending and enabled, then this IRQ should not + * be queued to any VCPU. + */ + return NULL; +} + +/* + * The order of items in the ap_lists defines how we'll pack things in LRs as + * well, the first items in the list being the first things populated in the + * LRs. + * + * A hard rule is that active interrupts can never be pushed out of the LRs + * (and therefore take priority) since we cannot reliably trap on deactivation + * of IRQs and therefore they have to be present in the LRs. + * + * Otherwise things should be sorted by the priority field and the GIC + * hardware support will take care of preemption of priority groups etc. + * + * Return negative if "a" sorts before "b", 0 to preserve order, and positive + * to sort "b" before "a". 
+ */ +static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list); + struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list); + bool penda, pendb; + int ret; + + /* + * list_sort may call this function with the same element when + * the list is fairly long. + */ + if (unlikely(irqa == irqb)) + return 0; + + raw_spin_lock(&irqa->irq_lock); + raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING); + + if (irqa->active || irqb->active) { + ret = (int)irqb->active - (int)irqa->active; + goto out; + } + + penda = irqa->enabled && irq_is_pending(irqa); + pendb = irqb->enabled && irq_is_pending(irqb); + + if (!penda || !pendb) { + ret = (int)pendb - (int)penda; + goto out; + } + + /* Both pending and enabled, sort by priority */ + ret = irqa->priority - irqb->priority; +out: + raw_spin_unlock(&irqb->irq_lock); + raw_spin_unlock(&irqa->irq_lock); + return ret; +} + +/* Must be called with the ap_list_lock held */ +static void vgic_sort_ap_list(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + lockdep_assert_held(&vgic_cpu->ap_list_lock); + + list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp); +} + +/* + * Only valid injection if changing level for level-triggered IRQs or for a + * rising edge, and in-kernel connected IRQ lines can only be controlled by + * their owner. + */ +static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owner) +{ + if (irq->owner != owner) + return false; + + switch (irq->config) { + case VGIC_CONFIG_LEVEL: + return irq->line_level != level; + case VGIC_CONFIG_EDGE: + return level; + } + + return false; +} + +/* + * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list. + * Do the queuing if necessary, taking the right locks in the right order. + * Returns true when the IRQ was queued, false otherwise. 
+ *
+ * Needs to be entered with the IRQ lock already held, but will return
+ * with all locks dropped.
+ *
+ * @flags is the interrupt state saved when the caller took irq->irq_lock
+ * with raw_spin_lock_irqsave(); it is what gets restored the first time the
+ * IRQ lock is dropped here. @kvm is not referenced by the body.
+ */
+bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
+			   unsigned long flags)
+{
+	struct kvm_vcpu *vcpu;
+
+	lockdep_assert_held(&irq->irq_lock);
+
+retry:	/* irq->irq_lock is held every time we get here */
+	vcpu = vgic_target_oracle(irq);
+	if (irq->vcpu || !vcpu) {
+		/*
+		 * If this IRQ is already on a VCPU's ap_list, then it
+		 * cannot be moved or modified and there is no more work for
+		 * us to do.
+		 *
+		 * Otherwise, if the irq is not pending and enabled, it does
+		 * not need to be inserted into an ap_list and there is also
+		 * no more work for us to do.
+		 */
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+		/*
+		 * We have to kick the VCPU here, because we could be
+		 * queueing an edge-triggered interrupt for which we
+		 * get no EOI maintenance interrupt. In that case,
+		 * while the IRQ is already on the VCPU's AP list, the
+		 * VCPU could have EOI'ed the original interrupt and
+		 * won't see this one until it exits for some other
+		 * reason.
+		 */
+		if (vcpu) {
+			kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+			kvm_vcpu_kick(vcpu);
+		}
+		return false;	/* not queued by this call */
+	}
+
+	/*
+	 * We must unlock the irq lock to take the ap_list_lock where
+	 * we are going to insert this new pending interrupt.
+	 */
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+	/* someone can do stuff here, which we re-check below */
+
+	/* ap_list_lock first, then irq_lock nested inside it */
+	raw_spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
+	raw_spin_lock(&irq->irq_lock);
+
+	/*
+	 * Did something change behind our backs?
+	 *
+	 * There are two cases:
+	 * 1) The irq lost its pending state or was disabled behind our
+	 *    backs and/or it was queued to another VCPU's ap_list.
+	 * 2) Someone changed the affinity on this irq behind our
+	 *    backs and we are now holding the wrong ap_list_lock.
+	 *
+	 * In both cases, drop the locks and retry.
+	 */
+
+	if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) {
+		raw_spin_unlock(&irq->irq_lock);
+		raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock,
+					   flags);
+
+		/* re-establish the entry condition of the retry label */
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		goto retry;
+	}
+
+	/*
+	 * Grab a reference to the irq to reflect the fact that it is
+	 * now in the ap_list.
+	 */
+	vgic_get_irq_kref(irq);
+	list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
+	irq->vcpu = vcpu;
+
+	raw_spin_unlock(&irq->irq_lock);
+	raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
+
+	/* Tell the target vcpu there is something new on its ap_list */
+	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+	kvm_vcpu_kick(vcpu);
+
+	return true;
+}
+
+/**
+ * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
+ * @kvm:     The VM structure pointer
+ * @cpuid:   The CPU for PPIs
+ * @intid:   The INTID to inject a new state to.
+ * @level:   Edge-triggered:  true:  to trigger the interrupt
+ *			      false: to ignore the call
+ *	     Level-sensitive  true:  raise the input signal
+ *			      false: lower the input signal
+ * @owner:   The opaque pointer to the owner of the IRQ being raised to verify
+ *           that the caller is allowed to inject this IRQ.  Userspace
+ *           injections will have owner == NULL.
+ *
+ * The VGIC is not concerned with devices being active-LOW or active-HIGH for
+ * level-sensitive interrupts.  You can think of the level parameter as 1
+ * being HIGH and 0 being LOW and all devices being active-HIGH.
+ */
+int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
+			bool level, void *owner)
+{
+	struct kvm_vcpu *vcpu;
+	struct vgic_irq *irq;
+	unsigned long flags;
+	int ret;
+	/*
+	 * Return values: the vgic_lazy_init() error if that fails, -EINVAL
+	 * when a private INTID has no vcpu or the irq lookup fails, and 0
+	 * otherwise -- including when the injection is rejected by
+	 * vgic_validate_injection().
+	 */
+	trace_vgic_update_irq_pending(cpuid, intid, level);
+
+	ret = vgic_lazy_init(kvm);
+	if (ret)
+		return ret;
+
+	vcpu = kvm_get_vcpu(kvm, cpuid);
+	if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS)	/* private INTIDs need a vcpu */
+		return -EINVAL;
+
+	irq = vgic_get_irq(kvm, vcpu, intid);
+	if (!irq)
+		return -EINVAL;
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+	if (!vgic_validate_injection(irq, level, owner)) {
+		/* Nothing to see here, move along... */
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		vgic_put_irq(kvm, irq);
+		return 0;
+	}
+
+	if (irq->config == VGIC_CONFIG_LEVEL)
+		irq->line_level = level;
+	else
+		irq->pending_latch = true;	/* not level-configured: latch it */
+
+	vgic_queue_irq_unlock(kvm, irq, flags);	/* drops irq->irq_lock */
+	vgic_put_irq(kvm, irq);
+
+	return 0;
+}
+
+/*
+ * @irq->irq_lock must be held
+ *
+ * Mark @irq as backed by the host interrupt @host_irq and record the
+ * optional @get_input_level callback. Returns -EINVAL if @host_irq has no
+ * interrupt descriptor, 0 otherwise. @vcpu is not referenced by the body.
+ */
+static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+			    unsigned int host_irq,
+			    bool (*get_input_level)(int vindid))
+{
+	struct irq_desc *desc;
+	struct irq_data *data;
+
+	/*
+	 * Find the physical IRQ number corresponding to @host_irq
+	 */
+	desc = irq_to_desc(host_irq);
+	if (!desc) {
+		kvm_err("%s: no interrupt descriptor\n", __func__);
+		return -EINVAL;
+	}
+	data = irq_desc_get_irq_data(desc);
+	while (data->parent_data)	/* walk up to the outermost parent */
+		data = data->parent_data;
+
+	irq->hw = true;
+	irq->host_irq = host_irq;
+	irq->hwintid = data->hwirq;
+	irq->get_input_level = get_input_level;
+	return 0;
+}
+
+/*
+ * @irq->irq_lock must be held
+ *
+ * Undo kvm_vgic_map_irq(): clear hw, hwintid and the input-level callback.
+ * irq->host_irq is left as it was.
+ */
+static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq)
+{
+	irq->hw = false;
+	irq->hwintid = 0;
+	irq->get_input_level = NULL;
+}
+/*
+ * Map the virtual interrupt @vintid of @vcpu onto the host interrupt
+ * @host_irq, taking the irq_lock around kvm_vgic_map_irq(). BUGs if the
+ * virtual interrupt cannot be looked up; returns kvm_vgic_map_irq()'s result.
+ */
+int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
+			  u32 vintid, bool (*get_input_level)(int vindid))
+{
+	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+	unsigned long flags;
+	int ret;
+
+	BUG_ON(!irq);
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+	ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level);
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+	vgic_put_irq(vcpu->kvm, irq);
+
+	return ret;
+}
+
+/**
+ * kvm_vgic_reset_mapped_irq - Reset a mapped IRQ
+ * @vcpu: The VCPU pointer
+ * @vintid: The INTID of the interrupt
+ *
+ * Reset the active and pending states of a mapped interrupt.  Kernel
+ * subsystems injecting mapped interrupts should reset their interrupt lines
+ * when we are doing a reset of the VM.
+ *
+ * Interrupts that are not marked hw are left untouched.
+ *
+ * NOTE(review): irq->hw is tested before irq_lock is taken and the
+ * vgic_get_irq() result is dereferenced unchecked, whereas
+ * kvm_vgic_map_phys_irq() does BUG_ON(!irq) -- confirm this is intended.
+ */
+void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid)
+{
+	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+	unsigned long flags;
+
+	if (!irq->hw)
+		goto out;
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+	irq->active = false;
+	irq->pending_latch = false;
+	irq->line_level = false;
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+out:
+	vgic_put_irq(vcpu->kvm, irq);
+}
+/*
+ * Remove the hardware mapping of @vintid on @vcpu. Returns -EAGAIN while the
+ * vgic is not initialized and 0 once the mapping has been cleared; BUGs if
+ * the interrupt cannot be looked up.
+ */
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid)
+{
+	struct vgic_irq *irq;
+	unsigned long flags;
+
+	if (!vgic_initialized(vcpu->kvm))
+		return -EAGAIN;
+
+	irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+	BUG_ON(!irq);
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+	kvm_vgic_unmap_irq(irq);
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+	vgic_put_irq(vcpu->kvm, irq);
+
+	return 0;
+}
+
+/**
+ * kvm_vgic_set_owner - Set the owner of an interrupt for a VM
+ *
+ * @vcpu:   Pointer to the VCPU (used for PPIs)
+ * @intid:  The virtual INTID identifying the interrupt (PPI or SPI)
+ * @owner:  Opaque pointer to the owner
+ *
+ * Returns 0 if intid is not already used by another in-kernel device and the
+ * owner is set, otherwise returns an error code.
+ */
+int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner)
+{
+	struct vgic_irq *irq;
+	unsigned long flags;
+	int ret = 0;
+
+	if (!vgic_initialized(vcpu->kvm))
+		return -EAGAIN;
+
+	/* SGIs and LPIs cannot be wired up to any device */
+	if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid))
+		return -EINVAL;
+	/*
+	 * NOTE(review): the vgic_get_irq() result is neither checked nor
+	 * released with vgic_put_irq() here. Presumably harmless because only
+	 * PPIs/SPIs get this far and vgic_put_irq() returns early for INTIDs
+	 * below VGIC_MIN_LPI -- confirm.
+	 */
+	irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+	if (irq->owner && irq->owner != owner)
+		ret = -EEXIST;	/* already claimed by a different owner */
+	else
+		irq->owner = owner;
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+	return ret;
+}
+
+/**
+ * vgic_prune_ap_list - Remove non-relevant interrupts from the list
+ *
+ * @vcpu: The VCPU pointer
+ *
+ * Go over the list of "interesting" interrupts, and prune those that we
+ * won't have to consider in the near future.
+ *
+ * Interrupts whose target is now a different vcpu are moved to the tail of
+ * that vcpu's ap_list, and that vcpu is kicked.
+ *
+ * Expects interrupts to be disabled (checked when CONFIG_DEBUG_SPINLOCK is
+ * set) and takes the ap_list_lock(s) itself.
+ */
+static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_irq *irq, *tmp;
+
+	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+retry:
+	raw_spin_lock(&vgic_cpu->ap_list_lock);
+
+	list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
+		struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB;
+		bool target_vcpu_needs_kick = false;
+
+		raw_spin_lock(&irq->irq_lock);
+
+		BUG_ON(vcpu != irq->vcpu);
+
+		target_vcpu = vgic_target_oracle(irq);
+
+		if (!target_vcpu) {
+			/*
+			 * We don't need to process this interrupt any
+			 * further, move it off the list.
+			 */
+			list_del(&irq->ap_list);
+			irq->vcpu = NULL;
+			raw_spin_unlock(&irq->irq_lock);
+
+			/*
+			 * This vgic_put_irq call matches the
+			 * vgic_get_irq_kref in vgic_queue_irq_unlock,
+			 * where we added the LPI to the ap_list. As
+			 * we remove the irq from the list, we also
+			 * drop the refcount.
+			 */
+			vgic_put_irq(vcpu->kvm, irq);
+			continue;
+		}
+
+		if (target_vcpu == vcpu) {
+			/* We're on the right CPU */
+			raw_spin_unlock(&irq->irq_lock);
+			continue;
+		}
+
+		/* This interrupt looks like it has to be migrated. */
+
+		raw_spin_unlock(&irq->irq_lock);
+		raw_spin_unlock(&vgic_cpu->ap_list_lock);
+
+		/*
+		 * Ensure locking order by always locking the smallest
+		 * ID first.
+		 */
+		if (vcpu->vcpu_id < target_vcpu->vcpu_id) {
+			vcpuA = vcpu;
+			vcpuB = target_vcpu;
+		} else {
+			vcpuA = target_vcpu;
+			vcpuB = vcpu;
+		}
+
+		raw_spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock);
+		raw_spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock,
+				      SINGLE_DEPTH_NESTING);
+		raw_spin_lock(&irq->irq_lock);
+
+		/*
+		 * If the affinity has been preserved, move the
+		 * interrupt around. Otherwise, it means things have
+		 * changed while the interrupt was unlocked, and we
+		 * need to replay this.
+		 *
+		 * In all cases, we cannot trust the list not to have
+		 * changed, so we restart from the beginning.
+		 */
+		if (target_vcpu == vgic_target_oracle(irq)) {
+			struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu;
+
+			list_del(&irq->ap_list);
+			irq->vcpu = target_vcpu;
+			list_add_tail(&irq->ap_list, &new_cpu->ap_list_head);
+			target_vcpu_needs_kick = true;
+		}
+
+		/* Release in the reverse order of acquisition */
+		raw_spin_unlock(&irq->irq_lock);
+		raw_spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock);
+		raw_spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock);
+
+		if (target_vcpu_needs_kick) {
+			kvm_make_request(KVM_REQ_IRQ_PENDING, target_vcpu);
+			kvm_vcpu_kick(target_vcpu);
+		}
+
+		goto retry;	/* all locks dropped; rescan from the list head */
+	}
+
+	raw_spin_unlock(&vgic_cpu->ap_list_lock);
+}
+/* Dispatch to the GICv2 or GICv3 fold routine based on the global vgic type */
+static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_fold_lr_state(vcpu);
+	else
+		vgic_v3_fold_lr_state(vcpu);
+}
+
+/* Requires the irq_lock to be held.
*/ +static inline void vgic_populate_lr(struct kvm_vcpu *vcpu, + struct vgic_irq *irq, int lr) +{ + lockdep_assert_held(&irq->irq_lock); + + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_populate_lr(vcpu, irq, lr); + else + vgic_v3_populate_lr(vcpu, irq, lr); +} + +static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_clear_lr(vcpu, lr); + else + vgic_v3_clear_lr(vcpu, lr); +} + +static inline void vgic_set_underflow(struct kvm_vcpu *vcpu) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_set_underflow(vcpu); + else + vgic_v3_set_underflow(vcpu); +} + +/* Requires the ap_list_lock to be held. */ +static int compute_ap_list_depth(struct kvm_vcpu *vcpu, + bool *multi_sgi) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_irq *irq; + int count = 0; + + *multi_sgi = false; + + lockdep_assert_held(&vgic_cpu->ap_list_lock); + + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + int w; + + raw_spin_lock(&irq->irq_lock); + /* GICv2 SGIs can count for more than one... */ + w = vgic_irq_get_lr_count(irq); + raw_spin_unlock(&irq->irq_lock); + + count += w; + *multi_sgi |= (w > 1); + } + return count; +} + +/* Requires the VCPU's ap_list_lock to be held. */ +static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_irq *irq; + int count; + bool multi_sgi; + u8 prio = 0xff; + + lockdep_assert_held(&vgic_cpu->ap_list_lock); + + count = compute_ap_list_depth(vcpu, &multi_sgi); + if (count > kvm_vgic_global_state.nr_lr || multi_sgi) + vgic_sort_ap_list(vcpu); + + count = 0; + + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + raw_spin_lock(&irq->irq_lock); + + /* + * If we have multi-SGIs in the pipeline, we need to + * guarantee that they are all seen before any IRQ of + * lower priority. In that case, we need to filter out + * these interrupts by exiting early. 
This is easy as + * the AP list has been sorted already. + */ + if (multi_sgi && irq->priority > prio) { + _raw_spin_unlock(&irq->irq_lock); + break; + } + + if (likely(vgic_target_oracle(irq) == vcpu)) { + vgic_populate_lr(vcpu, irq, count++); + + if (irq->source) + prio = irq->priority; + } + + raw_spin_unlock(&irq->irq_lock); + + if (count == kvm_vgic_global_state.nr_lr) { + if (!list_is_last(&irq->ap_list, + &vgic_cpu->ap_list_head)) + vgic_set_underflow(vcpu); + break; + } + } + + vcpu->arch.vgic_cpu.used_lrs = count; + + /* Nuke remaining LRs */ + for ( ; count < kvm_vgic_global_state.nr_lr; count++) + vgic_clear_lr(vcpu, count); +} + +static inline bool can_access_vgic_from_kernel(void) +{ + /* + * GICv2 can always be accessed from the kernel because it is + * memory-mapped, and VHE systems can access GICv3 EL2 system + * registers. + */ + return !static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) || has_vhe(); +} + +static inline void vgic_save_state(struct kvm_vcpu *vcpu) +{ + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + vgic_v2_save_state(vcpu); + else + __vgic_v3_save_state(vcpu); +} + +/* Sync back the hardware VGIC state into our emulation after a guest's run. */ +void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + /* An empty ap_list_head implies used_lrs == 0 */ + if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) + return; + + if (can_access_vgic_from_kernel()) + vgic_save_state(vcpu); + + if (vgic_cpu->used_lrs) + vgic_fold_lr_state(vcpu); + vgic_prune_ap_list(vcpu); +} + +static inline void vgic_restore_state(struct kvm_vcpu *vcpu) +{ + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + vgic_v2_restore_state(vcpu); + else + __vgic_v3_restore_state(vcpu); +} + +/* Flush our emulation state into the GIC hardware before entering the guest. 
*/
+void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * If there are no virtual interrupts active or pending for this
+	 * VCPU, then there is no work to do and we can bail out without
+	 * taking any lock.  There is a potential race with someone injecting
+	 * interrupts to the VCPU, but it is a benign race as the VCPU will
+	 * either observe the new interrupt before or after doing this check,
+	 * and introducing additional synchronization mechanism doesn't change
+	 * this.
+	 *
+	 * Note that we still need to go through the whole thing if anything
+	 * can be directly injected (GICv4).
+	 */
+	if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) &&
+	    !vgic_supports_direct_msis(vcpu->kvm))
+		return;
+
+	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+	if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) {
+		raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
+		vgic_flush_lr_state(vcpu);
+		raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+	}
+
+	if (can_access_vgic_from_kernel())
+		vgic_restore_state(vcpu);
+}
+/* No-op until the vgic is initialized; otherwise call the v2/v3 load hook */
+void kvm_vgic_load(struct kvm_vcpu *vcpu)
+{
+	if (unlikely(!vgic_initialized(vcpu->kvm)))
+		return;
+
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_load(vcpu);
+	else
+		vgic_v3_load(vcpu);
+}
+/* No-op until the vgic is initialized; otherwise call the v2/v3 put hook */
+void kvm_vgic_put(struct kvm_vcpu *vcpu)
+{
+	if (unlikely(!vgic_initialized(vcpu->kvm)))
+		return;
+
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_put(vcpu);
+	else
+		vgic_v3_put(vcpu);
+}
+/* No-op without an in-kernel irqchip; otherwise call the v2/v3 vmcr_sync hook */
+void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu)
+{
+	if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
+		return;
+
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_vmcr_sync(vcpu);
+	else
+		vgic_v3_vmcr_sync(vcpu);
+}
+/*
+ * Report whether @vcpu has an interrupt it could take: false while the
+ * distributor is disabled, true if its_vpe.pending_last is set, otherwise
+ * true when some ap_list entry is pending, enabled, not active and has a
+ * priority value below vmcr.pmr. The return type is int but only true/false
+ * are ever returned.
+ */
+int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_irq *irq;
+	bool pending = false;
+	unsigned long flags;
+	struct vgic_vmcr vmcr;
+
+	if (!vcpu->kvm->arch.vgic.enabled)
+		return false;
+
+	if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last)
+		return true;
+
+	vgic_get_vmcr(vcpu, &vmcr);
+
+	raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
+
+	list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+		raw_spin_lock(&irq->irq_lock);
+		pending = irq_is_pending(irq) && irq->enabled &&
+			  !irq->active &&
+			  irq->priority < vmcr.pmr;
+		raw_spin_unlock(&irq->irq_lock);
+
+		if (pending)	/* one deliverable interrupt is enough */
+			break;
+	}
+
+	raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+
+	return pending;
+}
+
+void vgic_kick_vcpus(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu;
+	int c;
+
+	/*
+	 * We've injected an interrupt, time to find out who deserves
+	 * a good kick...
+	 */
+	kvm_for_each_vcpu(c, vcpu, kvm) {
+		if (kvm_vgic_vcpu_pending_irq(vcpu)) {
+			kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+			kvm_vcpu_kick(vcpu);
+		}
+	}
+}
+/*
+ * Return true when @vintid on @vcpu is both hardware-mapped and active;
+ * always false while the vgic is not initialized.
+ */
+bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
+{
+	struct vgic_irq *irq;
+	bool map_is_active;
+	unsigned long flags;
+
+	if (!vgic_initialized(vcpu->kvm))
+		return false;
+
+	irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+	map_is_active = irq->hw && irq->active;
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+	vgic_put_irq(vcpu->kvm, irq);
+
+	return map_is_active;
+}
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
new file mode 100644
index 000000000000..769e4802645e
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -0,0 +1,321 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */ +#ifndef __KVM_ARM_VGIC_NEW_H__ +#define __KVM_ARM_VGIC_NEW_H__ + +#include + +#define PRODUCT_ID_KVM 0x4b /* ASCII code K */ +#define IMPLEMENTER_ARM 0x43b + +#define VGIC_ADDR_UNDEF (-1) +#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF) + +#define INTERRUPT_ID_BITS_SPIS 10 +#define INTERRUPT_ID_BITS_ITS 16 +#define VGIC_PRI_BITS 5 + +#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS) + +#define VGIC_AFFINITY_0_SHIFT 0 +#define VGIC_AFFINITY_0_MASK (0xffUL << VGIC_AFFINITY_0_SHIFT) +#define VGIC_AFFINITY_1_SHIFT 8 +#define VGIC_AFFINITY_1_MASK (0xffUL << VGIC_AFFINITY_1_SHIFT) +#define VGIC_AFFINITY_2_SHIFT 16 +#define VGIC_AFFINITY_2_MASK (0xffUL << VGIC_AFFINITY_2_SHIFT) +#define VGIC_AFFINITY_3_SHIFT 24 +#define VGIC_AFFINITY_3_MASK (0xffUL << VGIC_AFFINITY_3_SHIFT) + +#define VGIC_AFFINITY_LEVEL(reg, level) \ + ((((reg) & VGIC_AFFINITY_## level ##_MASK) \ + >> VGIC_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) + +/* + * The Userspace encodes the affinity differently from the MPIDR, + * Below macro converts vgic userspace format to MPIDR reg format. + */ +#define VGIC_TO_MPIDR(val) (VGIC_AFFINITY_LEVEL(val, 0) | \ + VGIC_AFFINITY_LEVEL(val, 1) | \ + VGIC_AFFINITY_LEVEL(val, 2) | \ + VGIC_AFFINITY_LEVEL(val, 3)) + +/* + * As per Documentation/virt/kvm/devices/arm-vgic-v3.txt, + * below macros are defined for CPUREG encoding. 
+ */
+#define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK	0x000000000000c000
+#define KVM_REG_ARM_VGIC_SYSREG_OP0_SHIFT	14
+#define KVM_REG_ARM_VGIC_SYSREG_OP1_MASK	0x0000000000003800
+#define KVM_REG_ARM_VGIC_SYSREG_OP1_SHIFT	11
+#define KVM_REG_ARM_VGIC_SYSREG_CRN_MASK	0x0000000000000780
+#define KVM_REG_ARM_VGIC_SYSREG_CRN_SHIFT	7
+#define KVM_REG_ARM_VGIC_SYSREG_CRM_MASK	0x0000000000000078
+#define KVM_REG_ARM_VGIC_SYSREG_CRM_SHIFT	3
+#define KVM_REG_ARM_VGIC_SYSREG_OP2_MASK	0x0000000000000007
+#define KVM_REG_ARM_VGIC_SYSREG_OP2_SHIFT	0
+/* Union of the five field masks above */
+#define KVM_DEV_ARM_VGIC_SYSREG_MASK (KVM_REG_ARM_VGIC_SYSREG_OP0_MASK | \
+				      KVM_REG_ARM_VGIC_SYSREG_OP1_MASK | \
+				      KVM_REG_ARM_VGIC_SYSREG_CRN_MASK | \
+				      KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \
+				      KVM_REG_ARM_VGIC_SYSREG_OP2_MASK)
+
+/*
+ * As per Documentation/virt/kvm/devices/arm-vgic-its.txt,
+ * below macros are defined for ITS table entry encoding.
+ */
+#define KVM_ITS_CTE_VALID_SHIFT		63
+#define KVM_ITS_CTE_VALID_MASK		BIT_ULL(63)
+#define KVM_ITS_CTE_RDBASE_SHIFT	16
+#define KVM_ITS_CTE_ICID_MASK		GENMASK_ULL(15, 0)
+#define KVM_ITS_ITE_NEXT_SHIFT		48
+#define KVM_ITS_ITE_PINTID_SHIFT	16
+#define KVM_ITS_ITE_PINTID_MASK		GENMASK_ULL(47, 16)
+#define KVM_ITS_ITE_ICID_MASK		GENMASK_ULL(15, 0)
+#define KVM_ITS_DTE_VALID_SHIFT		63
+#define KVM_ITS_DTE_VALID_MASK		BIT_ULL(63)
+#define KVM_ITS_DTE_NEXT_SHIFT		49
+#define KVM_ITS_DTE_NEXT_MASK		GENMASK_ULL(62, 49)
+#define KVM_ITS_DTE_ITTADDR_SHIFT	5
+#define KVM_ITS_DTE_ITTADDR_MASK	GENMASK_ULL(48, 5)
+#define KVM_ITS_DTE_SIZE_MASK		GENMASK_ULL(4, 0)
+#define KVM_ITS_L1E_VALID_MASK		BIT_ULL(63)
+/* we only support 64 kB translation table page size */
+#define KVM_ITS_L1E_ADDR_MASK		GENMASK_ULL(51, 16)
+/* Bit layout of the 64-bit redistributor region descriptor */
+#define KVM_VGIC_V3_RDIST_INDEX_MASK	GENMASK_ULL(11, 0)
+#define KVM_VGIC_V3_RDIST_FLAGS_MASK	GENMASK_ULL(15, 12)
+#define KVM_VGIC_V3_RDIST_FLAGS_SHIFT	12
+#define KVM_VGIC_V3_RDIST_BASE_MASK	GENMASK_ULL(51, 16)
+#define KVM_VGIC_V3_RDIST_COUNT_MASK	GENMASK_ULL(63, 52)
+#define KVM_VGIC_V3_RDIST_COUNT_SHIFT	52
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p)
+#else	/* expands to nothing without CONFIG_DEBUG_SPINLOCK */
+#define DEBUG_SPINLOCK_BUG_ON(p)
+#endif
+
+/*
+ * Requires the irq_lock to be held by the caller.
+ *
+ * Edge-configured interrupts are pending when pending_latch is set; any
+ * other configuration is pending when pending_latch or line_level is set.
+ */
+static inline bool irq_is_pending(struct vgic_irq *irq)
+{
+	if (irq->config == VGIC_CONFIG_EDGE)
+		return irq->pending_latch;
+	else
+		return irq->pending_latch || irq->line_level;
+}
+/* True for a level-configured interrupt that is marked hw */
+static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq)
+{
+	return irq->config == VGIC_CONFIG_LEVEL && irq->hw;
+}
+/*
+ * Weight of @irq when counting list registers: for an SGI with a non-zero
+ * source bitmap, one per bit set in irq->source plus one if it is active;
+ * otherwise 1 if it is pending or active and 0 if it is neither.
+ */
+static inline int vgic_irq_get_lr_count(struct vgic_irq *irq)
+{
+	/* Account for the active state as an interrupt */
+	if (vgic_irq_is_sgi(irq->intid) && irq->source)
+		return hweight8(irq->source) + irq->active;
+
+	return irq_is_pending(irq) || irq->active;
+}
+/* True when vgic_irq_get_lr_count() reports more than one for @irq */
+static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq)
+{
+	return vgic_irq_get_lr_count(irq) > 1;
+}
+
+/*
+ * This struct provides an intermediate representation of the fields contained
+ * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC
+ * state to userspace can generate either GICv2 or GICv3 CPU interface
+ * registers regardless of the hardware backed GIC used.
+ */
+struct vgic_vmcr {
+	u32	grpen0;
+	u32	grpen1;
+
+	u32	ackctl;
+	u32	fiqen;
+	u32	cbpr;
+	u32	eoim;
+
+	u32	abpr;
+	u32	bpr;
+	u32	pmr;  /* Priority mask field in the GICC_PMR and
+		       * ICC_PMR_EL1 priority field format */
+};
+/* Output of the vgic_v2/v3_parse_attr() helpers: a vcpu and a guest address */
+struct vgic_reg_attr {
+	struct kvm_vcpu *vcpu;
+	gpa_t addr;
+};
+
+int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
+		       struct vgic_reg_attr *reg_attr);
+int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
+		       struct vgic_reg_attr *reg_attr);
+const struct vgic_register_region *
+vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
+		     gpa_t addr, int len);
+struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
+			      u32 intid);
+void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq);
+void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
+bool vgic_get_phys_line_level(struct vgic_irq *irq);
+void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending);
+void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
+bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
+			   unsigned long flags);
+void vgic_kick_vcpus(struct kvm *kvm);
+
+int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+		      phys_addr_t addr, phys_addr_t alignment);
+/* GICv2 backend */
+void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
+void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
+void vgic_v2_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v2_set_npie(struct kvm_vcpu *vcpu);
+int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
+int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+			 int offset, u32 *val);
+int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+			  int offset, u32 *val);
+void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v2_enable(struct kvm_vcpu *vcpu);
+int vgic_v2_probe(const struct gic_kvm_info *info);
+int vgic_v2_map_resources(struct kvm *kvm);
+int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
+			     enum vgic_type);
+
+void vgic_v2_init_lrs(void);
+void vgic_v2_load(struct kvm_vcpu *vcpu);
+void vgic_v2_put(struct kvm_vcpu *vcpu);
+void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu);
+
+void vgic_v2_save_state(struct kvm_vcpu *vcpu);
+void vgic_v2_restore_state(struct kvm_vcpu *vcpu);
+/*
+ * Take a reference on @irq. INTIDs below VGIC_MIN_LPI are not refcounted
+ * here, so the call does nothing for them.
+ */
+static inline void vgic_get_irq_kref(struct vgic_irq *irq)
+{
+	if (irq->intid < VGIC_MIN_LPI)
+		return;
+
+	kref_get(&irq->refcount);
+}
+/* GICv3 backend */
+void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
+void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr);
+void vgic_v3_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v3_set_npie(struct kvm_vcpu *vcpu);
+void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v3_enable(struct kvm_vcpu *vcpu);
+int vgic_v3_probe(const struct gic_kvm_info *info);
+int vgic_v3_map_resources(struct kvm *kvm);
+int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq);
+int vgic_v3_save_pending_tables(struct kvm *kvm);
+int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count);
+int vgic_register_redist_iodev(struct kvm_vcpu *vcpu);
+bool vgic_v3_check_base(struct kvm *kvm);
+
+void vgic_v3_load(struct kvm_vcpu *vcpu);
+void vgic_v3_put(struct kvm_vcpu *vcpu);
+void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu);
+
+bool vgic_has_its(struct kvm *kvm);
+int kvm_vgic_register_its_device(void);
+void vgic_enable_lpis(struct kvm_vcpu *vcpu);
+void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu);
+int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
+int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
+int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+			 int offset, u32 *val);
+int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+			 int offset, u32 *val);
+int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+			 u64 id, u64 *val);
+int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id,
+				u64 *reg);
+int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+				    u32 intid, u64 *val);
+int kvm_register_vgic_device(unsigned long type);
+void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+int vgic_lazy_init(struct kvm *kvm);
+int vgic_init(struct kvm *kvm);
+
+void vgic_debug_init(struct kvm *kvm);
+void vgic_debug_destroy(struct kvm *kvm);
+
+bool lock_all_vcpus(struct kvm *kvm);
+void unlock_all_vcpus(struct kvm *kvm);
+/* Map num_pri_bits to an index: 7 -> 3, 6 -> 1, anything else -> 0 */
+static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu;
+
+	/*
+	 * num_pri_bits are initialized with HW supported values.
+	 * We can rely safely on num_pri_bits even if VM has not
+	 * restored ICC_CTLR_EL1 before restoring APnR registers.
+	 */
+	switch (cpu_if->num_pri_bits) {
+	case 7: return 3;
+	case 6: return 1;
+	default: return 0;
+	}
+}
+/* A region with count == 0 is never reported as full */
+static inline bool
+vgic_v3_redist_region_full(struct vgic_redist_region *region)
+{
+	if (!region->count)
+		return false;
+
+	return (region->free_index >= region->count);
+}
+
+struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rdregs);
+/*
+ * Size of a redistributor region in bytes: KVM_VGIC_V3_REDIST_SIZE times
+ * rdreg->count, or times the number of online vcpus when count is 0.
+ */
+static inline size_t
+vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg)
+{
+	if (!rdreg->count)
+		return atomic_read(&kvm->online_vcpus) * KVM_VGIC_V3_REDIST_SIZE;
+	else
+		return rdreg->count * KVM_VGIC_V3_REDIST_SIZE;
+}
+
+struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
+							   u32 index);
+
+bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size);
+/*
+ * True when [base, base + size) intersects the KVM_VGIC_V3_DIST_SIZE bytes
+ * starting at the distributor base address.
+ */
+static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size)
+{
+	struct vgic_dist *d = &kvm->arch.vgic;
+
+	return (base + size > d->vgic_dist_base) &&
+		(base < d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE);
+}
+
+int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr);
+int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
+			 u32 devid, u32 eventid, struct vgic_irq **irq);
+struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi);
+int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi);
+void vgic_lpi_translation_cache_init(struct kvm *kvm);
+void vgic_lpi_translation_cache_destroy(struct kvm *kvm);
+void vgic_its_invalidate_cache(struct kvm *kvm);
+
+bool vgic_supports_direct_msis(struct kvm *kvm);
+int vgic_v4_init(struct kvm *kvm);
+void vgic_v4_teardown(struct kvm *kvm);
+void vgic_v4_configure_vsgis(struct kvm *kvm);
+
+#endif
diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c
deleted file mode 100644
index 0a356aa91aa1..000000000000
--- a/virt/kvm/arm/aarch32.c
+++ /dev/null
@@ -1,204 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * (not much of an) Emulation layer for 32bit guests.
- * - * Copyright (C) 2012,2013 - ARM Ltd - * Author: Marc Zyngier - * - * based on arch/arm/kvm/emulate.c - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall - */ - -#include -#include -#include -#include - -#define DFSR_FSC_EXTABT_LPAE 0x10 -#define DFSR_FSC_EXTABT_nLPAE 0x08 -#define DFSR_LPAE BIT(9) - -/* - * Table taken from ARMv8 ARM DDI0487B-B, table G1-10. - */ -static const u8 return_offsets[8][2] = { - [0] = { 0, 0 }, /* Reset, unused */ - [1] = { 4, 2 }, /* Undefined */ - [2] = { 0, 0 }, /* SVC, unused */ - [3] = { 4, 4 }, /* Prefetch abort */ - [4] = { 8, 8 }, /* Data abort */ - [5] = { 0, 0 }, /* HVC, unused */ - [6] = { 4, 4 }, /* IRQ, unused */ - [7] = { 4, 4 }, /* FIQ, unused */ -}; - -/* - * When an exception is taken, most CPSR fields are left unchanged in the - * handler. However, some are explicitly overridden (e.g. M[4:0]). - * - * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with - * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was - * obsoleted by the ARMv7 virtualization extensions and is RES0. - * - * For the SPSR layout seen from AArch32, see: - * - ARM DDI 0406C.d, page B1-1148 - * - ARM DDI 0487E.a, page G8-6264 - * - * For the SPSR_ELx layout for AArch32 seen from AArch64, see: - * - ARM DDI 0487E.a, page C5-426 - * - * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from - * MSB to LSB. 
- */ -static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode) -{ - u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); - unsigned long old, new; - - old = *vcpu_cpsr(vcpu); - new = 0; - - new |= (old & PSR_AA32_N_BIT); - new |= (old & PSR_AA32_Z_BIT); - new |= (old & PSR_AA32_C_BIT); - new |= (old & PSR_AA32_V_BIT); - new |= (old & PSR_AA32_Q_BIT); - - // CPSR.IT[7:0] are set to zero upon any exception - // See ARM DDI 0487E.a, section G1.12.3 - // See ARM DDI 0406C.d, section B1.8.3 - - new |= (old & PSR_AA32_DIT_BIT); - - // CPSR.SSBS is set to SCTLR.DSSBS upon any exception - // See ARM DDI 0487E.a, page G8-6244 - if (sctlr & BIT(31)) - new |= PSR_AA32_SSBS_BIT; - - // CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0 - // SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented - // See ARM DDI 0487E.a, page G8-6246 - new |= (old & PSR_AA32_PAN_BIT); - if (!(sctlr & BIT(23))) - new |= PSR_AA32_PAN_BIT; - - // SS does not exist in AArch32, so ignore - - // CPSR.IL is set to zero upon any exception - // See ARM DDI 0487E.a, page G1-5527 - - new |= (old & PSR_AA32_GE_MASK); - - // CPSR.IT[7:0] are set to zero upon any exception - // See prior comment above - - // CPSR.E is set to SCTLR.EE upon any exception - // See ARM DDI 0487E.a, page G8-6245 - // See ARM DDI 0406C.d, page B4-1701 - if (sctlr & BIT(25)) - new |= PSR_AA32_E_BIT; - - // CPSR.A is unchanged upon an exception to Undefined, Supervisor - // CPSR.A is set upon an exception to other modes - // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 - // See ARM DDI 0406C.d, page B1-1182 - new |= (old & PSR_AA32_A_BIT); - if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC) - new |= PSR_AA32_A_BIT; - - // CPSR.I is set upon any exception - // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 - // See ARM DDI 0406C.d, page B1-1182 - new |= PSR_AA32_I_BIT; - - // CPSR.F is set upon an exception to FIQ - // CPSR.F is unchanged upon an exception to other modes - // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 - // 
See ARM DDI 0406C.d, page B1-1182 - new |= (old & PSR_AA32_F_BIT); - if (mode == PSR_AA32_MODE_FIQ) - new |= PSR_AA32_F_BIT; - - // CPSR.T is set to SCTLR.TE upon any exception - // See ARM DDI 0487E.a, page G8-5514 - // See ARM DDI 0406C.d, page B1-1181 - if (sctlr & BIT(30)) - new |= PSR_AA32_T_BIT; - - new |= mode; - - return new; -} - -static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) -{ - unsigned long spsr = *vcpu_cpsr(vcpu); - bool is_thumb = (spsr & PSR_AA32_T_BIT); - u32 return_offset = return_offsets[vect_offset >> 2][is_thumb]; - u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); - - *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode); - - /* Note: These now point to the banked copies */ - vcpu_write_spsr(vcpu, host_spsr_to_spsr32(spsr)); - *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; - - /* Branch to exception vector */ - if (sctlr & (1 << 13)) - vect_offset += 0xffff0000; - else /* always have security exceptions */ - vect_offset += vcpu_cp15(vcpu, c12_VBAR); - - *vcpu_pc(vcpu) = vect_offset; -} - -void kvm_inject_undef32(struct kvm_vcpu *vcpu) -{ - prepare_fault32(vcpu, PSR_AA32_MODE_UND, 4); -} - -/* - * Modelled after TakeDataAbortException() and TakePrefetchAbortException - * pseudocode. 
- */ -static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, - unsigned long addr) -{ - u32 vect_offset; - u32 *far, *fsr; - bool is_lpae; - - if (is_pabt) { - vect_offset = 12; - far = &vcpu_cp15(vcpu, c6_IFAR); - fsr = &vcpu_cp15(vcpu, c5_IFSR); - } else { /* !iabt */ - vect_offset = 16; - far = &vcpu_cp15(vcpu, c6_DFAR); - fsr = &vcpu_cp15(vcpu, c5_DFSR); - } - - prepare_fault32(vcpu, PSR_AA32_MODE_ABT, vect_offset); - - *far = addr; - - /* Give the guest an IMPLEMENTATION DEFINED exception */ - is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); - if (is_lpae) { - *fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE; - } else { - /* no need to shuffle FS[4] into DFSR[10] as its 0 */ - *fsr = DFSR_FSC_EXTABT_nLPAE; - } -} - -void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr) -{ - inject_abt32(vcpu, false, addr); -} - -void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr) -{ - inject_abt32(vcpu, true, addr); -} diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c deleted file mode 100644 index 93bd59b46848..000000000000 --- a/virt/kvm/arm/arch_timer.c +++ /dev/null @@ -1,1180 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2012 ARM Ltd. 
- * Author: Marc Zyngier - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include - -#include "trace.h" - -static struct timecounter *timecounter; -static unsigned int host_vtimer_irq; -static unsigned int host_ptimer_irq; -static u32 host_vtimer_irq_flags; -static u32 host_ptimer_irq_flags; - -static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); - -static const struct kvm_irq_level default_ptimer_irq = { - .irq = 30, - .level = 1, -}; - -static const struct kvm_irq_level default_vtimer_irq = { - .irq = 27, - .level = 1, -}; - -static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx); -static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, - struct arch_timer_context *timer_ctx); -static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); -static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, - struct arch_timer_context *timer, - enum kvm_arch_timer_regs treg, - u64 val); -static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, - struct arch_timer_context *timer, - enum kvm_arch_timer_regs treg); - -u64 kvm_phys_timer_read(void) -{ - return timecounter->cc->read(timecounter->cc); -} - -static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) -{ - if (has_vhe()) { - map->direct_vtimer = vcpu_vtimer(vcpu); - map->direct_ptimer = vcpu_ptimer(vcpu); - map->emul_ptimer = NULL; - } else { - map->direct_vtimer = vcpu_vtimer(vcpu); - map->direct_ptimer = NULL; - map->emul_ptimer = vcpu_ptimer(vcpu); - } - - trace_kvm_get_timer_map(vcpu->vcpu_id, map); -} - -static inline bool userspace_irqchip(struct kvm *kvm) -{ - return static_branch_unlikely(&userspace_irqchip_in_use) && - unlikely(!irqchip_in_kernel(kvm)); -} - -static void soft_timer_start(struct hrtimer *hrt, u64 ns) -{ - hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns), - HRTIMER_MODE_ABS_HARD); -} - -static void soft_timer_cancel(struct hrtimer *hrt) -{ - hrtimer_cancel(hrt); -} - 
-static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) -{ - struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; - struct arch_timer_context *ctx; - struct timer_map map; - - /* - * We may see a timer interrupt after vcpu_put() has been called which - * sets the CPU's vcpu pointer to NULL, because even though the timer - * has been disabled in timer_save_state(), the hardware interrupt - * signal may not have been retired from the interrupt controller yet. - */ - if (!vcpu) - return IRQ_HANDLED; - - get_timer_map(vcpu, &map); - - if (irq == host_vtimer_irq) - ctx = map.direct_vtimer; - else - ctx = map.direct_ptimer; - - if (kvm_timer_should_fire(ctx)) - kvm_timer_update_irq(vcpu, true, ctx); - - if (userspace_irqchip(vcpu->kvm) && - !static_branch_unlikely(&has_gic_active_state)) - disable_percpu_irq(host_vtimer_irq); - - return IRQ_HANDLED; -} - -static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) -{ - u64 cval, now; - - cval = timer_ctx->cnt_cval; - now = kvm_phys_timer_read() - timer_ctx->cntvoff; - - if (now < cval) { - u64 ns; - - ns = cyclecounter_cyc2ns(timecounter->cc, - cval - now, - timecounter->mask, - &timecounter->frac); - return ns; - } - - return 0; -} - -static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) -{ - WARN_ON(timer_ctx && timer_ctx->loaded); - return timer_ctx && - !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) && - (timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE); -} - -/* - * Returns the earliest expiration time in ns among guest timers. - * Note that it will return 0 if none of timers can fire. 
- */ -static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) -{ - u64 min_delta = ULLONG_MAX; - int i; - - for (i = 0; i < NR_KVM_TIMERS; i++) { - struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i]; - - WARN(ctx->loaded, "timer %d loaded\n", i); - if (kvm_timer_irq_can_fire(ctx)) - min_delta = min(min_delta, kvm_timer_compute_delta(ctx)); - } - - /* If none of timers can fire, then return 0 */ - if (min_delta == ULLONG_MAX) - return 0; - - return min_delta; -} - -static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) -{ - struct arch_timer_cpu *timer; - struct kvm_vcpu *vcpu; - u64 ns; - - timer = container_of(hrt, struct arch_timer_cpu, bg_timer); - vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu); - - /* - * Check that the timer has really expired from the guest's - * PoV (NTP on the host may have forced it to expire - * early). If we should have slept longer, restart it. - */ - ns = kvm_timer_earliest_exp(vcpu); - if (unlikely(ns)) { - hrtimer_forward_now(hrt, ns_to_ktime(ns)); - return HRTIMER_RESTART; - } - - kvm_vcpu_wake_up(vcpu); - return HRTIMER_NORESTART; -} - -static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt) -{ - struct arch_timer_context *ctx; - struct kvm_vcpu *vcpu; - u64 ns; - - ctx = container_of(hrt, struct arch_timer_context, hrtimer); - vcpu = ctx->vcpu; - - trace_kvm_timer_hrtimer_expire(ctx); - - /* - * Check that the timer has really expired from the guest's - * PoV (NTP on the host may have forced it to expire - * early). If not ready, schedule for a later time. 
- */ - ns = kvm_timer_compute_delta(ctx); - if (unlikely(ns)) { - hrtimer_forward_now(hrt, ns_to_ktime(ns)); - return HRTIMER_RESTART; - } - - kvm_timer_update_irq(vcpu, true, ctx); - return HRTIMER_NORESTART; -} - -static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) -{ - enum kvm_arch_timers index; - u64 cval, now; - - if (!timer_ctx) - return false; - - index = arch_timer_ctx_index(timer_ctx); - - if (timer_ctx->loaded) { - u32 cnt_ctl = 0; - - switch (index) { - case TIMER_VTIMER: - cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL); - break; - case TIMER_PTIMER: - cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL); - break; - case NR_KVM_TIMERS: - /* GCC is braindead */ - cnt_ctl = 0; - break; - } - - return (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) && - (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) && - !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK); - } - - if (!kvm_timer_irq_can_fire(timer_ctx)) - return false; - - cval = timer_ctx->cnt_cval; - now = kvm_phys_timer_read() - timer_ctx->cntvoff; - - return cval <= now; -} - -bool kvm_timer_is_pending(struct kvm_vcpu *vcpu) -{ - struct timer_map map; - - get_timer_map(vcpu, &map); - - return kvm_timer_should_fire(map.direct_vtimer) || - kvm_timer_should_fire(map.direct_ptimer) || - kvm_timer_should_fire(map.emul_ptimer); -} - -/* - * Reflect the timer output level into the kvm_run structure - */ -void kvm_timer_update_run(struct kvm_vcpu *vcpu) -{ - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - struct kvm_sync_regs *regs = &vcpu->run->s.regs; - - /* Populate the device bitmap with the timer states */ - regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER | - KVM_ARM_DEV_EL1_PTIMER); - if (kvm_timer_should_fire(vtimer)) - regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER; - if (kvm_timer_should_fire(ptimer)) - regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; -} - -static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, - struct arch_timer_context *timer_ctx) 
-{ - int ret; - - timer_ctx->irq.level = new_level; - trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq, - timer_ctx->irq.level); - - if (!userspace_irqchip(vcpu->kvm)) { - ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, - timer_ctx->irq.irq, - timer_ctx->irq.level, - timer_ctx); - WARN_ON(ret); - } -} - -/* Only called for a fully emulated timer */ -static void timer_emulate(struct arch_timer_context *ctx) -{ - bool should_fire = kvm_timer_should_fire(ctx); - - trace_kvm_timer_emulate(ctx, should_fire); - - if (should_fire != ctx->irq.level) { - kvm_timer_update_irq(ctx->vcpu, should_fire, ctx); - return; - } - - /* - * If the timer can fire now, we don't need to have a soft timer - * scheduled for the future. If the timer cannot fire at all, - * then we also don't need a soft timer. - */ - if (!kvm_timer_irq_can_fire(ctx)) { - soft_timer_cancel(&ctx->hrtimer); - return; - } - - soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx)); -} - -static void timer_save_state(struct arch_timer_context *ctx) -{ - struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); - enum kvm_arch_timers index = arch_timer_ctx_index(ctx); - unsigned long flags; - - if (!timer->enabled) - return; - - local_irq_save(flags); - - if (!ctx->loaded) - goto out; - - switch (index) { - case TIMER_VTIMER: - ctx->cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL); - ctx->cnt_cval = read_sysreg_el0(SYS_CNTV_CVAL); - - /* Disable the timer */ - write_sysreg_el0(0, SYS_CNTV_CTL); - isb(); - - break; - case TIMER_PTIMER: - ctx->cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL); - ctx->cnt_cval = read_sysreg_el0(SYS_CNTP_CVAL); - - /* Disable the timer */ - write_sysreg_el0(0, SYS_CNTP_CTL); - isb(); - - break; - case NR_KVM_TIMERS: - BUG(); - } - - trace_kvm_timer_save_state(ctx); - - ctx->loaded = false; -out: - local_irq_restore(flags); -} - -/* - * Schedule the background timer before calling kvm_vcpu_block, so that this - * thread is removed from its waitqueue and made runnable when there's a 
timer - * interrupt to handle. - */ -static void kvm_timer_blocking(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct timer_map map; - - get_timer_map(vcpu, &map); - - /* - * If no timers are capable of raising interrupts (disabled or - * masked), then there's no more work for us to do. - */ - if (!kvm_timer_irq_can_fire(map.direct_vtimer) && - !kvm_timer_irq_can_fire(map.direct_ptimer) && - !kvm_timer_irq_can_fire(map.emul_ptimer)) - return; - - /* - * At least one guest time will expire. Schedule a background timer. - * Set the earliest expiration time among the guest timers. - */ - soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu)); -} - -static void kvm_timer_unblocking(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = vcpu_timer(vcpu); - - soft_timer_cancel(&timer->bg_timer); -} - -static void timer_restore_state(struct arch_timer_context *ctx) -{ - struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); - enum kvm_arch_timers index = arch_timer_ctx_index(ctx); - unsigned long flags; - - if (!timer->enabled) - return; - - local_irq_save(flags); - - if (ctx->loaded) - goto out; - - switch (index) { - case TIMER_VTIMER: - write_sysreg_el0(ctx->cnt_cval, SYS_CNTV_CVAL); - isb(); - write_sysreg_el0(ctx->cnt_ctl, SYS_CNTV_CTL); - break; - case TIMER_PTIMER: - write_sysreg_el0(ctx->cnt_cval, SYS_CNTP_CVAL); - isb(); - write_sysreg_el0(ctx->cnt_ctl, SYS_CNTP_CTL); - break; - case NR_KVM_TIMERS: - BUG(); - } - - trace_kvm_timer_restore_state(ctx); - - ctx->loaded = true; -out: - local_irq_restore(flags); -} - -static void set_cntvoff(u64 cntvoff) -{ - u32 low = lower_32_bits(cntvoff); - u32 high = upper_32_bits(cntvoff); - - /* - * Since kvm_call_hyp doesn't fully support the ARM PCS especially on - * 32-bit systems, but rather passes register by register shifted one - * place (we put the function address in r0/x0), we cannot simply pass - * a 64-bit value as an argument, but have to split the value in two - * 
32-bit halves. - */ - kvm_call_hyp(__kvm_timer_set_cntvoff, low, high); -} - -static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active) -{ - int r; - r = irq_set_irqchip_state(ctx->host_timer_irq, IRQCHIP_STATE_ACTIVE, active); - WARN_ON(r); -} - -static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) -{ - struct kvm_vcpu *vcpu = ctx->vcpu; - bool phys_active = false; - - /* - * Update the timer output so that it is likely to match the - * state we're about to restore. If the timer expires between - * this point and the register restoration, we'll take the - * interrupt anyway. - */ - kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx); - - if (irqchip_in_kernel(vcpu->kvm)) - phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq); - - phys_active |= ctx->irq.level; - - set_timer_irq_phys_active(ctx, phys_active); -} - -static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) -{ - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - - /* - * Update the timer output so that it is likely to match the - * state we're about to restore. If the timer expires between - * this point and the register restoration, we'll take the - * interrupt anyway. - */ - kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vtimer), vtimer); - - /* - * When using a userspace irqchip with the architected timers and a - * host interrupt controller that doesn't support an active state, we - * must still prevent continuously exiting from the guest, and - * therefore mask the physical interrupt by disabling it on the host - * interrupt controller when the virtual level is high, such that the - * guest can make forward progress. Once we detect the output level - * being de-asserted, we unmask the interrupt again so that we exit - * from the guest when the timer fires. 
- */ - if (vtimer->irq.level) - disable_percpu_irq(host_vtimer_irq); - else - enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); -} - -void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct timer_map map; - - if (unlikely(!timer->enabled)) - return; - - get_timer_map(vcpu, &map); - - if (static_branch_likely(&has_gic_active_state)) { - kvm_timer_vcpu_load_gic(map.direct_vtimer); - if (map.direct_ptimer) - kvm_timer_vcpu_load_gic(map.direct_ptimer); - } else { - kvm_timer_vcpu_load_nogic(vcpu); - } - - set_cntvoff(map.direct_vtimer->cntvoff); - - kvm_timer_unblocking(vcpu); - - timer_restore_state(map.direct_vtimer); - if (map.direct_ptimer) - timer_restore_state(map.direct_ptimer); - - if (map.emul_ptimer) - timer_emulate(map.emul_ptimer); -} - -bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) -{ - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - struct kvm_sync_regs *sregs = &vcpu->run->s.regs; - bool vlevel, plevel; - - if (likely(irqchip_in_kernel(vcpu->kvm))) - return false; - - vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER; - plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER; - - return kvm_timer_should_fire(vtimer) != vlevel || - kvm_timer_should_fire(ptimer) != plevel; -} - -void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct timer_map map; - - if (unlikely(!timer->enabled)) - return; - - get_timer_map(vcpu, &map); - - timer_save_state(map.direct_vtimer); - if (map.direct_ptimer) - timer_save_state(map.direct_ptimer); - - /* - * Cancel soft timer emulation, because the only case where we - * need it after a vcpu_put is in the context of a sleeping VCPU, and - * in that case we already factor in the deadline for the physical - * timer when scheduling the bg_timer. 
- * - * In any case, we re-schedule the hrtimer for the physical timer when - * coming back to the VCPU thread in kvm_timer_vcpu_load(). - */ - if (map.emul_ptimer) - soft_timer_cancel(&map.emul_ptimer->hrtimer); - - if (swait_active(kvm_arch_vcpu_wq(vcpu))) - kvm_timer_blocking(vcpu); - - /* - * The kernel may decide to run userspace after calling vcpu_put, so - * we reset cntvoff to 0 to ensure a consistent read between user - * accesses to the virtual counter and kernel access to the physical - * counter of non-VHE case. For VHE, the virtual counter uses a fixed - * virtual offset of zero, so no need to zero CNTVOFF_EL2 register. - */ - set_cntvoff(0); -} - -/* - * With a userspace irqchip we have to check if the guest de-asserted the - * timer and if so, unmask the timer irq signal on the host interrupt - * controller to ensure that we see future timer signals. - */ -static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu) -{ - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - - if (!kvm_timer_should_fire(vtimer)) { - kvm_timer_update_irq(vcpu, false, vtimer); - if (static_branch_likely(&has_gic_active_state)) - set_timer_irq_phys_active(vtimer, false); - else - enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); - } -} - -void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = vcpu_timer(vcpu); - - if (unlikely(!timer->enabled)) - return; - - if (unlikely(!irqchip_in_kernel(vcpu->kvm))) - unmask_vtimer_irq_user(vcpu); -} - -int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct timer_map map; - - get_timer_map(vcpu, &map); - - /* - * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 - * and to 0 for ARMv7. We provide an implementation that always - * resets the timer to be disabled and unmasked and is compliant with - * the ARMv7 architecture. 
- */ - vcpu_vtimer(vcpu)->cnt_ctl = 0; - vcpu_ptimer(vcpu)->cnt_ctl = 0; - - if (timer->enabled) { - kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu)); - kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu)); - - if (irqchip_in_kernel(vcpu->kvm)) { - kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq); - if (map.direct_ptimer) - kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq); - } - } - - if (map.emul_ptimer) - soft_timer_cancel(&map.emul_ptimer->hrtimer); - - return 0; -} - -/* Make the updates of cntvoff for all vtimer contexts atomic */ -static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff) -{ - int i; - struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu *tmp; - - mutex_lock(&kvm->lock); - kvm_for_each_vcpu(i, tmp, kvm) - vcpu_vtimer(tmp)->cntvoff = cntvoff; - - /* - * When called from the vcpu create path, the CPU being created is not - * included in the loop above, so we just set it here as well. - */ - vcpu_vtimer(vcpu)->cntvoff = cntvoff; - mutex_unlock(&kvm->lock); -} - -void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - - /* Synchronize cntvoff across all vtimers of a VM. 
*/ - update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); - ptimer->cntvoff = 0; - - hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - timer->bg_timer.function = kvm_bg_timer_expire; - - hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - vtimer->hrtimer.function = kvm_hrtimer_expire; - ptimer->hrtimer.function = kvm_hrtimer_expire; - - vtimer->irq.irq = default_vtimer_irq.irq; - ptimer->irq.irq = default_ptimer_irq.irq; - - vtimer->host_timer_irq = host_vtimer_irq; - ptimer->host_timer_irq = host_ptimer_irq; - - vtimer->host_timer_irq_flags = host_vtimer_irq_flags; - ptimer->host_timer_irq_flags = host_ptimer_irq_flags; - - vtimer->vcpu = vcpu; - ptimer->vcpu = vcpu; -} - -static void kvm_timer_init_interrupt(void *info) -{ - enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); - enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags); -} - -int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) -{ - struct arch_timer_context *timer; - - switch (regid) { - case KVM_REG_ARM_TIMER_CTL: - timer = vcpu_vtimer(vcpu); - kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); - break; - case KVM_REG_ARM_TIMER_CNT: - timer = vcpu_vtimer(vcpu); - update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value); - break; - case KVM_REG_ARM_TIMER_CVAL: - timer = vcpu_vtimer(vcpu); - kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); - break; - case KVM_REG_ARM_PTIMER_CTL: - timer = vcpu_ptimer(vcpu); - kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); - break; - case KVM_REG_ARM_PTIMER_CVAL: - timer = vcpu_ptimer(vcpu); - kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); - break; - - default: - return -1; - } - - return 0; -} - -static u64 read_timer_ctl(struct arch_timer_context *timer) -{ - /* - * Set ISTATUS bit if it's expired. 
- * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is - * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit - * regardless of ENABLE bit for our implementation convenience. - */ - if (!kvm_timer_compute_delta(timer)) - return timer->cnt_ctl | ARCH_TIMER_CTRL_IT_STAT; - else - return timer->cnt_ctl; -} - -u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) -{ - switch (regid) { - case KVM_REG_ARM_TIMER_CTL: - return kvm_arm_timer_read(vcpu, - vcpu_vtimer(vcpu), TIMER_REG_CTL); - case KVM_REG_ARM_TIMER_CNT: - return kvm_arm_timer_read(vcpu, - vcpu_vtimer(vcpu), TIMER_REG_CNT); - case KVM_REG_ARM_TIMER_CVAL: - return kvm_arm_timer_read(vcpu, - vcpu_vtimer(vcpu), TIMER_REG_CVAL); - case KVM_REG_ARM_PTIMER_CTL: - return kvm_arm_timer_read(vcpu, - vcpu_ptimer(vcpu), TIMER_REG_CTL); - case KVM_REG_ARM_PTIMER_CNT: - return kvm_arm_timer_read(vcpu, - vcpu_ptimer(vcpu), TIMER_REG_CNT); - case KVM_REG_ARM_PTIMER_CVAL: - return kvm_arm_timer_read(vcpu, - vcpu_ptimer(vcpu), TIMER_REG_CVAL); - } - return (u64)-1; -} - -static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, - struct arch_timer_context *timer, - enum kvm_arch_timer_regs treg) -{ - u64 val; - - switch (treg) { - case TIMER_REG_TVAL: - val = timer->cnt_cval - kvm_phys_timer_read() + timer->cntvoff; - val &= lower_32_bits(val); - break; - - case TIMER_REG_CTL: - val = read_timer_ctl(timer); - break; - - case TIMER_REG_CVAL: - val = timer->cnt_cval; - break; - - case TIMER_REG_CNT: - val = kvm_phys_timer_read() - timer->cntvoff; - break; - - default: - BUG(); - } - - return val; -} - -u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, - enum kvm_arch_timers tmr, - enum kvm_arch_timer_regs treg) -{ - u64 val; - - preempt_disable(); - kvm_timer_vcpu_put(vcpu); - - val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg); - - kvm_timer_vcpu_load(vcpu); - preempt_enable(); - - return val; -} - -static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, - struct arch_timer_context *timer, - 
enum kvm_arch_timer_regs treg, - u64 val) -{ - switch (treg) { - case TIMER_REG_TVAL: - timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + (s32)val; - break; - - case TIMER_REG_CTL: - timer->cnt_ctl = val & ~ARCH_TIMER_CTRL_IT_STAT; - break; - - case TIMER_REG_CVAL: - timer->cnt_cval = val; - break; - - default: - BUG(); - } -} - -void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu, - enum kvm_arch_timers tmr, - enum kvm_arch_timer_regs treg, - u64 val) -{ - preempt_disable(); - kvm_timer_vcpu_put(vcpu); - - kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val); - - kvm_timer_vcpu_load(vcpu); - preempt_enable(); -} - -static int kvm_timer_starting_cpu(unsigned int cpu) -{ - kvm_timer_init_interrupt(NULL); - return 0; -} - -static int kvm_timer_dying_cpu(unsigned int cpu) -{ - disable_percpu_irq(host_vtimer_irq); - return 0; -} - -int kvm_timer_hyp_init(bool has_gic) -{ - struct arch_timer_kvm_info *info; - int err; - - info = arch_timer_get_kvm_info(); - timecounter = &info->timecounter; - - if (!timecounter->cc) { - kvm_err("kvm_arch_timer: uninitialized timecounter\n"); - return -ENODEV; - } - - /* First, do the virtual EL1 timer irq */ - - if (info->virtual_irq <= 0) { - kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n", - info->virtual_irq); - return -ENODEV; - } - host_vtimer_irq = info->virtual_irq; - - host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq); - if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH && - host_vtimer_irq_flags != IRQF_TRIGGER_LOW) { - kvm_err("Invalid trigger for vtimer IRQ%d, assuming level low\n", - host_vtimer_irq); - host_vtimer_irq_flags = IRQF_TRIGGER_LOW; - } - - err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler, - "kvm guest vtimer", kvm_get_running_vcpus()); - if (err) { - kvm_err("kvm_arch_timer: can't request vtimer interrupt %d (%d)\n", - host_vtimer_irq, err); - return err; - } - - if (has_gic) { - err = irq_set_vcpu_affinity(host_vtimer_irq, - kvm_get_running_vcpus()); - if 
(err) { - kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); - goto out_free_irq; - } - - static_branch_enable(&has_gic_active_state); - } - - kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq); - - /* Now let's do the physical EL1 timer irq */ - - if (info->physical_irq > 0) { - host_ptimer_irq = info->physical_irq; - host_ptimer_irq_flags = irq_get_trigger_type(host_ptimer_irq); - if (host_ptimer_irq_flags != IRQF_TRIGGER_HIGH && - host_ptimer_irq_flags != IRQF_TRIGGER_LOW) { - kvm_err("Invalid trigger for ptimer IRQ%d, assuming level low\n", - host_ptimer_irq); - host_ptimer_irq_flags = IRQF_TRIGGER_LOW; - } - - err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler, - "kvm guest ptimer", kvm_get_running_vcpus()); - if (err) { - kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n", - host_ptimer_irq, err); - return err; - } - - if (has_gic) { - err = irq_set_vcpu_affinity(host_ptimer_irq, - kvm_get_running_vcpus()); - if (err) { - kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); - goto out_free_irq; - } - } - - kvm_debug("physical timer IRQ%d\n", host_ptimer_irq); - } else if (has_vhe()) { - kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n", - info->physical_irq); - err = -ENODEV; - goto out_free_irq; - } - - cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, - "kvm/arm/timer:starting", kvm_timer_starting_cpu, - kvm_timer_dying_cpu); - return 0; -out_free_irq: - free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus()); - return err; -} - -void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = vcpu_timer(vcpu); - - soft_timer_cancel(&timer->bg_timer); -} - -static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) -{ - int vtimer_irq, ptimer_irq; - int i, ret; - - vtimer_irq = vcpu_vtimer(vcpu)->irq.irq; - ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu)); - if (ret) - return false; - - ptimer_irq = vcpu_ptimer(vcpu)->irq.irq; - ret = kvm_vgic_set_owner(vcpu, 
ptimer_irq, vcpu_ptimer(vcpu)); - if (ret) - return false; - - kvm_for_each_vcpu(i, vcpu, vcpu->kvm) { - if (vcpu_vtimer(vcpu)->irq.irq != vtimer_irq || - vcpu_ptimer(vcpu)->irq.irq != ptimer_irq) - return false; - } - - return true; -} - -bool kvm_arch_timer_get_input_level(int vintid) -{ - struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); - struct arch_timer_context *timer; - - if (vintid == vcpu_vtimer(vcpu)->irq.irq) - timer = vcpu_vtimer(vcpu); - else if (vintid == vcpu_ptimer(vcpu)->irq.irq) - timer = vcpu_ptimer(vcpu); - else - BUG(); - - return kvm_timer_should_fire(timer); -} - -int kvm_timer_enable(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct timer_map map; - int ret; - - if (timer->enabled) - return 0; - - /* Without a VGIC we do not map virtual IRQs to physical IRQs */ - if (!irqchip_in_kernel(vcpu->kvm)) - goto no_vgic; - - if (!vgic_initialized(vcpu->kvm)) - return -ENODEV; - - if (!timer_irqs_are_valid(vcpu)) { - kvm_debug("incorrectly configured timer irqs\n"); - return -EINVAL; - } - - get_timer_map(vcpu, &map); - - ret = kvm_vgic_map_phys_irq(vcpu, - map.direct_vtimer->host_timer_irq, - map.direct_vtimer->irq.irq, - kvm_arch_timer_get_input_level); - if (ret) - return ret; - - if (map.direct_ptimer) { - ret = kvm_vgic_map_phys_irq(vcpu, - map.direct_ptimer->host_timer_irq, - map.direct_ptimer->irq.irq, - kvm_arch_timer_get_input_level); - } - - if (ret) - return ret; - -no_vgic: - timer->enabled = 1; - return 0; -} - -/* - * On VHE system, we only need to configure the EL2 timer trap register once, - * not for every world switch. - * The host kernel runs at EL2 with HCR_EL2.TGE == 1, - * and this makes those bits have no effect for the host kernel execution. - */ -void kvm_timer_init_vhe(void) -{ - /* When HCR_EL2.E2H ==1, EL1PCEN and EL1PCTEN are shifted by 10 */ - u32 cnthctl_shift = 10; - u64 val; - - /* - * VHE systems allow the guest direct access to the EL1 physical - * timer/counter. 
- */ - val = read_sysreg(cnthctl_el2); - val |= (CNTHCTL_EL1PCEN << cnthctl_shift); - val |= (CNTHCTL_EL1PCTEN << cnthctl_shift); - write_sysreg(val, cnthctl_el2); -} - -static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq) -{ - struct kvm_vcpu *vcpu; - int i; - - kvm_for_each_vcpu(i, vcpu, kvm) { - vcpu_vtimer(vcpu)->irq.irq = vtimer_irq; - vcpu_ptimer(vcpu)->irq.irq = ptimer_irq; - } -} - -int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - int __user *uaddr = (int __user *)(long)attr->addr; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - int irq; - - if (!irqchip_in_kernel(vcpu->kvm)) - return -EINVAL; - - if (get_user(irq, uaddr)) - return -EFAULT; - - if (!(irq_is_ppi(irq))) - return -EINVAL; - - if (vcpu->arch.timer_cpu.enabled) - return -EBUSY; - - switch (attr->attr) { - case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: - set_timer_irqs(vcpu->kvm, irq, ptimer->irq.irq); - break; - case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: - set_timer_irqs(vcpu->kvm, vtimer->irq.irq, irq); - break; - default: - return -ENXIO; - } - - return 0; -} - -int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - int __user *uaddr = (int __user *)(long)attr->addr; - struct arch_timer_context *timer; - int irq; - - switch (attr->attr) { - case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: - timer = vcpu_vtimer(vcpu); - break; - case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: - timer = vcpu_ptimer(vcpu); - break; - default: - return -ENXIO; - } - - irq = timer->irq.irq; - return put_user(irq, uaddr); -} - -int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - switch (attr->attr) { - case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: - case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: - return 0; - } - - return -ENXIO; -} diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c deleted file mode 100644 index 48d0ec44ad77..000000000000 --- a/virt/kvm/arm/arm.c +++ /dev/null @@ -1,1681 
+0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include "trace.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#ifdef REQUIRES_VIRT -__asm__(".arch_extension virt"); -#endif - -DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data); -static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); - -/* The VMID used in the VTTBR */ -static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); -static u32 kvm_next_vmid; -static DEFINE_SPINLOCK(kvm_vmid_lock); - -static bool vgic_present; - -static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); -DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); - -int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; -} - -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - -int kvm_vm_ioctl_enable_cap(struct kvm *kvm, - struct kvm_enable_cap *cap) -{ - int r; - - if (cap->flags) - return -EINVAL; - - switch (cap->cap) { - case KVM_CAP_ARM_NISV_TO_USER: - r = 0; - kvm->arch.return_nisv_io_abort_to_user = true; - break; - default: - r = -EINVAL; - break; - } - - return r; -} - -/** - * kvm_arch_init_vm - initializes a VM data structure - * @kvm: pointer to the KVM struct - */ -int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) -{ - int ret, cpu; - - ret = kvm_arm_setup_stage2(kvm, type); - if (ret) - return ret; - - kvm->arch.last_vcpu_ran = alloc_percpu(typeof(*kvm->arch.last_vcpu_ran)); - if (!kvm->arch.last_vcpu_ran) - return -ENOMEM; - - for_each_possible_cpu(cpu) - 
*per_cpu_ptr(kvm->arch.last_vcpu_ran, cpu) = -1; - - ret = kvm_alloc_stage2_pgd(kvm); - if (ret) - goto out_fail_alloc; - - ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP); - if (ret) - goto out_free_stage2_pgd; - - kvm_vgic_early_init(kvm); - - /* Mark the initial VMID generation invalid */ - kvm->arch.vmid.vmid_gen = 0; - - /* The maximum number of VCPUs is limited by the host's GIC model */ - kvm->arch.max_vcpus = vgic_present ? - kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS; - - return ret; -out_free_stage2_pgd: - kvm_free_stage2_pgd(kvm); -out_fail_alloc: - free_percpu(kvm->arch.last_vcpu_ran); - kvm->arch.last_vcpu_ran = NULL; - return ret; -} - -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) -{ - return 0; -} - -vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) -{ - return VM_FAULT_SIGBUS; -} - - -/** - * kvm_arch_destroy_vm - destroy the VM data structure - * @kvm: pointer to the KVM struct - */ -void kvm_arch_destroy_vm(struct kvm *kvm) -{ - int i; - - kvm_vgic_destroy(kvm); - - free_percpu(kvm->arch.last_vcpu_ran); - kvm->arch.last_vcpu_ran = NULL; - - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - if (kvm->vcpus[i]) { - kvm_vcpu_destroy(kvm->vcpus[i]); - kvm->vcpus[i] = NULL; - } - } - atomic_set(&kvm->online_vcpus, 0); -} - -int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) -{ - int r; - switch (ext) { - case KVM_CAP_IRQCHIP: - r = vgic_present; - break; - case KVM_CAP_IOEVENTFD: - case KVM_CAP_DEVICE_CTRL: - case KVM_CAP_USER_MEMORY: - case KVM_CAP_SYNC_MMU: - case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: - case KVM_CAP_ONE_REG: - case KVM_CAP_ARM_PSCI: - case KVM_CAP_ARM_PSCI_0_2: - case KVM_CAP_READONLY_MEM: - case KVM_CAP_MP_STATE: - case KVM_CAP_IMMEDIATE_EXIT: - case KVM_CAP_VCPU_EVENTS: - case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2: - case KVM_CAP_ARM_NISV_TO_USER: - case KVM_CAP_ARM_INJECT_EXT_DABT: - r = 1; - break; - case KVM_CAP_ARM_SET_DEVICE_ADDR: - r = 1; - break; - case KVM_CAP_NR_VCPUS: - r = num_online_cpus(); - 
break; - case KVM_CAP_MAX_VCPUS: - r = KVM_MAX_VCPUS; - break; - case KVM_CAP_MAX_VCPU_ID: - r = KVM_MAX_VCPU_ID; - break; - case KVM_CAP_MSI_DEVID: - if (!kvm) - r = -EINVAL; - else - r = kvm->arch.vgic.msis_require_devid; - break; - case KVM_CAP_ARM_USER_IRQ: - /* - * 1: EL1_VTIMER, EL1_PTIMER, and PMU. - * (bump this number if adding more devices) - */ - r = 1; - break; - default: - r = kvm_arch_vm_ioctl_check_extension(kvm, ext); - break; - } - return r; -} - -long kvm_arch_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) -{ - return -EINVAL; -} - -struct kvm *kvm_arch_alloc_vm(void) -{ - if (!has_vhe()) - return kzalloc(sizeof(struct kvm), GFP_KERNEL); - - return vzalloc(sizeof(struct kvm)); -} - -void kvm_arch_free_vm(struct kvm *kvm) -{ - if (!has_vhe()) - kfree(kvm); - else - vfree(kvm); -} - -int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) -{ - if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) - return -EBUSY; - - if (id >= kvm->arch.max_vcpus) - return -EINVAL; - - return 0; -} - -int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) -{ - int err; - - /* Force users to call KVM_ARM_VCPU_INIT */ - vcpu->arch.target = -1; - bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); - - /* Set up the timer */ - kvm_timer_vcpu_init(vcpu); - - kvm_pmu_vcpu_init(vcpu); - - kvm_arm_reset_debug_ptr(vcpu); - - kvm_arm_pvtime_vcpu_init(&vcpu->arch); - - err = kvm_vgic_vcpu_init(vcpu); - if (err) - return err; - - return create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP); -} - -void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) -{ -} - -void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm))) - static_branch_dec(&userspace_irqchip_in_use); - - kvm_mmu_free_memory_caches(vcpu); - kvm_timer_vcpu_terminate(vcpu); - kvm_pmu_vcpu_destroy(vcpu); - - kvm_arm_vcpu_destroy(vcpu); -} - -int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) -{ - return kvm_timer_is_pending(vcpu); -} 
- -void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) -{ - /* - * If we're about to block (most likely because we've just hit a - * WFI), we need to sync back the state of the GIC CPU interface - * so that we have the latest PMR and group enables. This ensures - * that kvm_arch_vcpu_runnable has up-to-date data to decide - * whether we have pending interrupts. - * - * For the same reason, we want to tell GICv4 that we need - * doorbells to be signalled, should an interrupt become pending. - */ - preempt_disable(); - kvm_vgic_vmcr_sync(vcpu); - vgic_v4_put(vcpu, true); - preempt_enable(); -} - -void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - vgic_v4_load(vcpu); - preempt_enable(); -} - -void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ - int *last_ran; - kvm_host_data_t *cpu_data; - - last_ran = this_cpu_ptr(vcpu->kvm->arch.last_vcpu_ran); - cpu_data = this_cpu_ptr(&kvm_host_data); - - /* - * We might get preempted before the vCPU actually runs, but - * over-invalidation doesn't affect correctness. 
- */ - if (*last_ran != vcpu->vcpu_id) { - kvm_call_hyp(__kvm_tlb_flush_local_vmid, vcpu); - *last_ran = vcpu->vcpu_id; - } - - vcpu->cpu = cpu; - vcpu->arch.host_cpu_context = &cpu_data->host_ctxt; - - kvm_vgic_load(vcpu); - kvm_timer_vcpu_load(vcpu); - kvm_vcpu_load_sysregs(vcpu); - kvm_arch_vcpu_load_fp(vcpu); - kvm_vcpu_pmu_restore_guest(vcpu); - if (kvm_arm_is_pvtime_enabled(&vcpu->arch)) - kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu); - - if (single_task_running()) - vcpu_clear_wfx_traps(vcpu); - else - vcpu_set_wfx_traps(vcpu); - - vcpu_ptrauth_setup_lazy(vcpu); -} - -void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) -{ - kvm_arch_vcpu_put_fp(vcpu); - kvm_vcpu_put_sysregs(vcpu); - kvm_timer_vcpu_put(vcpu); - kvm_vgic_put(vcpu); - kvm_vcpu_pmu_restore_host(vcpu); - - vcpu->cpu = -1; -} - -static void vcpu_power_off(struct kvm_vcpu *vcpu) -{ - vcpu->arch.power_off = true; - kvm_make_request(KVM_REQ_SLEEP, vcpu); - kvm_vcpu_kick(vcpu); -} - -int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, - struct kvm_mp_state *mp_state) -{ - if (vcpu->arch.power_off) - mp_state->mp_state = KVM_MP_STATE_STOPPED; - else - mp_state->mp_state = KVM_MP_STATE_RUNNABLE; - - return 0; -} - -int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, - struct kvm_mp_state *mp_state) -{ - int ret = 0; - - switch (mp_state->mp_state) { - case KVM_MP_STATE_RUNNABLE: - vcpu->arch.power_off = false; - break; - case KVM_MP_STATE_STOPPED: - vcpu_power_off(vcpu); - break; - default: - ret = -EINVAL; - } - - return ret; -} - -/** - * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled - * @v: The VCPU pointer - * - * If the guest CPU is not waiting for interrupts or an interrupt line is - * asserted, the CPU is by definition runnable. 
- */ -int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) -{ - bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF); - return ((irq_lines || kvm_vgic_vcpu_pending_irq(v)) - && !v->arch.power_off && !v->arch.pause); -} - -bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) -{ - return vcpu_mode_priv(vcpu); -} - -/* Just ensure a guest exit from a particular CPU */ -static void exit_vm_noop(void *info) -{ -} - -void force_vm_exit(const cpumask_t *mask) -{ - preempt_disable(); - smp_call_function_many(mask, exit_vm_noop, NULL, true); - preempt_enable(); -} - -/** - * need_new_vmid_gen - check that the VMID is still valid - * @vmid: The VMID to check - * - * return true if there is a new generation of VMIDs being used - * - * The hardware supports a limited set of values with the value zero reserved - * for the host, so we check if an assigned value belongs to a previous - * generation, which which requires us to assign a new value. If we're the - * first to use a VMID for the new generation, we must flush necessary caches - * and TLBs on all CPUs. - */ -static bool need_new_vmid_gen(struct kvm_vmid *vmid) -{ - u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen); - smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */ - return unlikely(READ_ONCE(vmid->vmid_gen) != current_vmid_gen); -} - -/** - * update_vmid - Update the vmid with a valid VMID for the current generation - * @kvm: The guest that struct vmid belongs to - * @vmid: The stage-2 VMID information struct - */ -static void update_vmid(struct kvm_vmid *vmid) -{ - if (!need_new_vmid_gen(vmid)) - return; - - spin_lock(&kvm_vmid_lock); - - /* - * We need to re-check the vmid_gen here to ensure that if another vcpu - * already allocated a valid vmid for this vm, then this vcpu should - * use the same vmid. - */ - if (!need_new_vmid_gen(vmid)) { - spin_unlock(&kvm_vmid_lock); - return; - } - - /* First user of a new VMID generation? 
*/ - if (unlikely(kvm_next_vmid == 0)) { - atomic64_inc(&kvm_vmid_gen); - kvm_next_vmid = 1; - - /* - * On SMP we know no other CPUs can use this CPU's or each - * other's VMID after force_vm_exit returns since the - * kvm_vmid_lock blocks them from reentry to the guest. - */ - force_vm_exit(cpu_all_mask); - /* - * Now broadcast TLB + ICACHE invalidation over the inner - * shareable domain to make sure all data structures are - * clean. - */ - kvm_call_hyp(__kvm_flush_vm_context); - } - - vmid->vmid = kvm_next_vmid; - kvm_next_vmid++; - kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1; - - smp_wmb(); - WRITE_ONCE(vmid->vmid_gen, atomic64_read(&kvm_vmid_gen)); - - spin_unlock(&kvm_vmid_lock); -} - -static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - int ret = 0; - - if (likely(vcpu->arch.has_run_once)) - return 0; - - if (!kvm_arm_vcpu_is_finalized(vcpu)) - return -EPERM; - - vcpu->arch.has_run_once = true; - - if (likely(irqchip_in_kernel(kvm))) { - /* - * Map the VGIC hardware resources before running a vcpu the - * first time on this VM. - */ - if (unlikely(!vgic_ready(kvm))) { - ret = kvm_vgic_map_resources(kvm); - if (ret) - return ret; - } - } else { - /* - * Tell the rest of the code that there are userspace irqchip - * VMs in the wild. 
- */ - static_branch_inc(&userspace_irqchip_in_use); - } - - ret = kvm_timer_enable(vcpu); - if (ret) - return ret; - - ret = kvm_arm_pmu_v3_enable(vcpu); - - return ret; -} - -bool kvm_arch_intc_initialized(struct kvm *kvm) -{ - return vgic_initialized(kvm); -} - -void kvm_arm_halt_guest(struct kvm *kvm) -{ - int i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) - vcpu->arch.pause = true; - kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP); -} - -void kvm_arm_resume_guest(struct kvm *kvm) -{ - int i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) { - vcpu->arch.pause = false; - swake_up_one(kvm_arch_vcpu_wq(vcpu)); - } -} - -static void vcpu_req_sleep(struct kvm_vcpu *vcpu) -{ - struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); - - swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) && - (!vcpu->arch.pause))); - - if (vcpu->arch.power_off || vcpu->arch.pause) { - /* Awaken to handle a signal, request we sleep again later. */ - kvm_make_request(KVM_REQ_SLEEP, vcpu); - } - - /* - * Make sure we will observe a potential reset request if we've - * observed a change to the power state. Pairs with the smp_wmb() in - * kvm_psci_vcpu_on(). - */ - smp_rmb(); -} - -static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.target >= 0; -} - -static void check_vcpu_requests(struct kvm_vcpu *vcpu) -{ - if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) - vcpu_req_sleep(vcpu); - - if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) - kvm_reset_vcpu(vcpu); - - /* - * Clear IRQ_PENDING requests that were made to guarantee - * that a VCPU sees new virtual interrupts. 
- */ - kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu); - - if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu)) - kvm_update_stolen_time(vcpu); - - if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) { - /* The distributor enable bits were changed */ - preempt_disable(); - vgic_v4_put(vcpu, false); - vgic_v4_load(vcpu); - preempt_enable(); - } - } -} - -/** - * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code - * @vcpu: The VCPU pointer - * @run: The kvm_run structure pointer used for userspace state exchange - * - * This function is called through the VCPU_RUN ioctl called from user space. It - * will execute VM code in a loop until the time slice for the process is used - * or some emulation is needed from user space in which case the function will - * return with return value 0 and with the kvm_run structure filled in with the - * required data for the requested emulation. - */ -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - int ret; - - if (unlikely(!kvm_vcpu_initialized(vcpu))) - return -ENOEXEC; - - ret = kvm_vcpu_first_run_init(vcpu); - if (ret) - return ret; - - if (run->exit_reason == KVM_EXIT_MMIO) { - ret = kvm_handle_mmio_return(vcpu, vcpu->run); - if (ret) - return ret; - } - - if (run->immediate_exit) - return -EINTR; - - vcpu_load(vcpu); - - kvm_sigset_activate(vcpu); - - ret = 1; - run->exit_reason = KVM_EXIT_UNKNOWN; - while (ret > 0) { - /* - * Check conditions before entering the guest - */ - cond_resched(); - - update_vmid(&vcpu->kvm->arch.vmid); - - check_vcpu_requests(vcpu); - - /* - * Preparing the interrupts to be injected also - * involves poking the GIC, which must be done in a - * non-preemptible context. - */ - preempt_disable(); - - kvm_pmu_flush_hwstate(vcpu); - - local_irq_disable(); - - kvm_vgic_flush_hwstate(vcpu); - - /* - * Exit if we have a signal pending so that we can deliver the - * signal to user space. 
- */ - if (signal_pending(current)) { - ret = -EINTR; - run->exit_reason = KVM_EXIT_INTR; - } - - /* - * If we're using a userspace irqchip, then check if we need - * to tell a userspace irqchip about timer or PMU level - * changes and if so, exit to userspace (the actual level - * state gets updated in kvm_timer_update_run and - * kvm_pmu_update_run below). - */ - if (static_branch_unlikely(&userspace_irqchip_in_use)) { - if (kvm_timer_should_notify_user(vcpu) || - kvm_pmu_should_notify_user(vcpu)) { - ret = -EINTR; - run->exit_reason = KVM_EXIT_INTR; - } - } - - /* - * Ensure we set mode to IN_GUEST_MODE after we disable - * interrupts and before the final VCPU requests check. - * See the comment in kvm_vcpu_exiting_guest_mode() and - * Documentation/virt/kvm/vcpu-requests.rst - */ - smp_store_mb(vcpu->mode, IN_GUEST_MODE); - - if (ret <= 0 || need_new_vmid_gen(&vcpu->kvm->arch.vmid) || - kvm_request_pending(vcpu)) { - vcpu->mode = OUTSIDE_GUEST_MODE; - isb(); /* Ensure work in x_flush_hwstate is committed */ - kvm_pmu_sync_hwstate(vcpu); - if (static_branch_unlikely(&userspace_irqchip_in_use)) - kvm_timer_sync_hwstate(vcpu); - kvm_vgic_sync_hwstate(vcpu); - local_irq_enable(); - preempt_enable(); - continue; - } - - kvm_arm_setup_debug(vcpu); - - /************************************************************** - * Enter the guest - */ - trace_kvm_entry(*vcpu_pc(vcpu)); - guest_enter_irqoff(); - - if (has_vhe()) { - ret = kvm_vcpu_run_vhe(vcpu); - } else { - ret = kvm_call_hyp_ret(__kvm_vcpu_run_nvhe, vcpu); - } - - vcpu->mode = OUTSIDE_GUEST_MODE; - vcpu->stat.exits++; - /* - * Back from guest - *************************************************************/ - - kvm_arm_clear_debug(vcpu); - - /* - * We must sync the PMU state before the vgic state so - * that the vgic can properly sample the updated state of the - * interrupt line. 
- */ - kvm_pmu_sync_hwstate(vcpu); - - /* - * Sync the vgic state before syncing the timer state because - * the timer code needs to know if the virtual timer - * interrupts are active. - */ - kvm_vgic_sync_hwstate(vcpu); - - /* - * Sync the timer hardware state before enabling interrupts as - * we don't want vtimer interrupts to race with syncing the - * timer virtual interrupt state. - */ - if (static_branch_unlikely(&userspace_irqchip_in_use)) - kvm_timer_sync_hwstate(vcpu); - - kvm_arch_vcpu_ctxsync_fp(vcpu); - - /* - * We may have taken a host interrupt in HYP mode (ie - * while executing the guest). This interrupt is still - * pending, as we haven't serviced it yet! - * - * We're now back in SVC mode, with interrupts - * disabled. Enabling the interrupts now will have - * the effect of taking the interrupt again, in SVC - * mode this time. - */ - local_irq_enable(); - - /* - * We do local_irq_enable() before calling guest_exit() so - * that if a timer interrupt hits while running the guest we - * account that tick as being spent in the guest. We enable - * preemption after calling guest_exit() so that if we get - * preempted we make sure ticks after that is not counted as - * guest time. 
- */ - guest_exit(); - trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); - - /* Exit types that need handling before we can be preempted */ - handle_exit_early(vcpu, run, ret); - - preempt_enable(); - - ret = handle_exit(vcpu, run, ret); - } - - /* Tell userspace about in-kernel device output levels */ - if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { - kvm_timer_update_run(vcpu); - kvm_pmu_update_run(vcpu); - } - - kvm_sigset_deactivate(vcpu); - - vcpu_put(vcpu); - return ret; -} - -static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level) -{ - int bit_index; - bool set; - unsigned long *hcr; - - if (number == KVM_ARM_IRQ_CPU_IRQ) - bit_index = __ffs(HCR_VI); - else /* KVM_ARM_IRQ_CPU_FIQ */ - bit_index = __ffs(HCR_VF); - - hcr = vcpu_hcr(vcpu); - if (level) - set = test_and_set_bit(bit_index, hcr); - else - set = test_and_clear_bit(bit_index, hcr); - - /* - * If we didn't change anything, no need to wake up or kick other CPUs - */ - if (set == level) - return 0; - - /* - * The vcpu irq_lines field was updated, wake up sleeping VCPUs and - * trigger a world-switch round on the running physical CPU to set the - * virtual IRQ/FIQ fields in the HCR appropriately. 
- */ - kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); - kvm_vcpu_kick(vcpu); - - return 0; -} - -int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, - bool line_status) -{ - u32 irq = irq_level->irq; - unsigned int irq_type, vcpu_idx, irq_num; - int nrcpus = atomic_read(&kvm->online_vcpus); - struct kvm_vcpu *vcpu = NULL; - bool level = irq_level->level; - - irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK; - vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK; - vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1); - irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK; - - trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level); - - switch (irq_type) { - case KVM_ARM_IRQ_TYPE_CPU: - if (irqchip_in_kernel(kvm)) - return -ENXIO; - - if (vcpu_idx >= nrcpus) - return -EINVAL; - - vcpu = kvm_get_vcpu(kvm, vcpu_idx); - if (!vcpu) - return -EINVAL; - - if (irq_num > KVM_ARM_IRQ_CPU_FIQ) - return -EINVAL; - - return vcpu_interrupt_line(vcpu, irq_num, level); - case KVM_ARM_IRQ_TYPE_PPI: - if (!irqchip_in_kernel(kvm)) - return -ENXIO; - - if (vcpu_idx >= nrcpus) - return -EINVAL; - - vcpu = kvm_get_vcpu(kvm, vcpu_idx); - if (!vcpu) - return -EINVAL; - - if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS) - return -EINVAL; - - return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL); - case KVM_ARM_IRQ_TYPE_SPI: - if (!irqchip_in_kernel(kvm)) - return -ENXIO; - - if (irq_num < VGIC_NR_PRIVATE_IRQS) - return -EINVAL; - - return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL); - } - - return -EINVAL; -} - -static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, - const struct kvm_vcpu_init *init) -{ - unsigned int i, ret; - int phys_target = kvm_target_cpu(); - - if (init->target != phys_target) - return -EINVAL; - - /* - * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must - * use the same target. 
- */ - if (vcpu->arch.target != -1 && vcpu->arch.target != init->target) - return -EINVAL; - - /* -ENOENT for unknown features, -EINVAL for invalid combinations. */ - for (i = 0; i < sizeof(init->features) * 8; i++) { - bool set = (init->features[i / 32] & (1 << (i % 32))); - - if (set && i >= KVM_VCPU_MAX_FEATURES) - return -ENOENT; - - /* - * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must - * use the same feature set. - */ - if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES && - test_bit(i, vcpu->arch.features) != set) - return -EINVAL; - - if (set) - set_bit(i, vcpu->arch.features); - } - - vcpu->arch.target = phys_target; - - /* Now we know what it is, we can reset it. */ - ret = kvm_reset_vcpu(vcpu); - if (ret) { - vcpu->arch.target = -1; - bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); - } - - return ret; -} - -static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, - struct kvm_vcpu_init *init) -{ - int ret; - - ret = kvm_vcpu_set_target(vcpu, init); - if (ret) - return ret; - - /* - * Ensure a rebooted VM will fault in RAM pages and detect if the - * guest MMU is turned off and flush the caches as needed. - */ - if (vcpu->arch.has_run_once) - stage2_unmap_vm(vcpu->kvm); - - vcpu_reset_hcr(vcpu); - - /* - * Handle the "start in power-off" case. 
- */ - if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) - vcpu_power_off(vcpu); - else - vcpu->arch.power_off = false; - - return 0; -} - -static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - int ret = -ENXIO; - - switch (attr->group) { - default: - ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr); - break; - } - - return ret; -} - -static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - int ret = -ENXIO; - - switch (attr->group) { - default: - ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr); - break; - } - - return ret; -} - -static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - int ret = -ENXIO; - - switch (attr->group) { - default: - ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr); - break; - } - - return ret; -} - -static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - memset(events, 0, sizeof(*events)); - - return __kvm_arm_vcpu_get_events(vcpu, events); -} - -static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - int i; - - /* check whether the reserved field is zero */ - for (i = 0; i < ARRAY_SIZE(events->reserved); i++) - if (events->reserved[i]) - return -EINVAL; - - /* check whether the pad field is zero */ - for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++) - if (events->exception.pad[i]) - return -EINVAL; - - return __kvm_arm_vcpu_set_events(vcpu, events); -} - -long kvm_arch_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) -{ - struct kvm_vcpu *vcpu = filp->private_data; - void __user *argp = (void __user *)arg; - struct kvm_device_attr attr; - long r; - - switch (ioctl) { - case KVM_ARM_VCPU_INIT: { - struct kvm_vcpu_init init; - - r = -EFAULT; - if (copy_from_user(&init, argp, sizeof(init))) - break; - - r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init); - break; - } - case KVM_SET_ONE_REG: - case KVM_GET_ONE_REG: { - struct 
kvm_one_reg reg; - - r = -ENOEXEC; - if (unlikely(!kvm_vcpu_initialized(vcpu))) - break; - - r = -EFAULT; - if (copy_from_user(®, argp, sizeof(reg))) - break; - - if (ioctl == KVM_SET_ONE_REG) - r = kvm_arm_set_reg(vcpu, ®); - else - r = kvm_arm_get_reg(vcpu, ®); - break; - } - case KVM_GET_REG_LIST: { - struct kvm_reg_list __user *user_list = argp; - struct kvm_reg_list reg_list; - unsigned n; - - r = -ENOEXEC; - if (unlikely(!kvm_vcpu_initialized(vcpu))) - break; - - r = -EPERM; - if (!kvm_arm_vcpu_is_finalized(vcpu)) - break; - - r = -EFAULT; - if (copy_from_user(®_list, user_list, sizeof(reg_list))) - break; - n = reg_list.n; - reg_list.n = kvm_arm_num_regs(vcpu); - if (copy_to_user(user_list, ®_list, sizeof(reg_list))) - break; - r = -E2BIG; - if (n < reg_list.n) - break; - r = kvm_arm_copy_reg_indices(vcpu, user_list->reg); - break; - } - case KVM_SET_DEVICE_ATTR: { - r = -EFAULT; - if (copy_from_user(&attr, argp, sizeof(attr))) - break; - r = kvm_arm_vcpu_set_attr(vcpu, &attr); - break; - } - case KVM_GET_DEVICE_ATTR: { - r = -EFAULT; - if (copy_from_user(&attr, argp, sizeof(attr))) - break; - r = kvm_arm_vcpu_get_attr(vcpu, &attr); - break; - } - case KVM_HAS_DEVICE_ATTR: { - r = -EFAULT; - if (copy_from_user(&attr, argp, sizeof(attr))) - break; - r = kvm_arm_vcpu_has_attr(vcpu, &attr); - break; - } - case KVM_GET_VCPU_EVENTS: { - struct kvm_vcpu_events events; - - if (kvm_arm_vcpu_get_events(vcpu, &events)) - return -EINVAL; - - if (copy_to_user(argp, &events, sizeof(events))) - return -EFAULT; - - return 0; - } - case KVM_SET_VCPU_EVENTS: { - struct kvm_vcpu_events events; - - if (copy_from_user(&events, argp, sizeof(events))) - return -EFAULT; - - return kvm_arm_vcpu_set_events(vcpu, &events); - } - case KVM_ARM_VCPU_FINALIZE: { - int what; - - if (!kvm_vcpu_initialized(vcpu)) - return -ENOEXEC; - - if (get_user(what, (const int __user *)argp)) - return -EFAULT; - - return kvm_arm_vcpu_finalize(vcpu, what); - } - default: - r = -EINVAL; - } - - return r; 
-} - -void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) -{ - -} - -void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, - struct kvm_memory_slot *memslot) -{ - kvm_flush_remote_tlbs(kvm); -} - -static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, - struct kvm_arm_device_addr *dev_addr) -{ - unsigned long dev_id, type; - - dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >> - KVM_ARM_DEVICE_ID_SHIFT; - type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >> - KVM_ARM_DEVICE_TYPE_SHIFT; - - switch (dev_id) { - case KVM_ARM_DEVICE_VGIC_V2: - if (!vgic_present) - return -ENXIO; - return kvm_vgic_addr(kvm, type, &dev_addr->addr, true); - default: - return -ENODEV; - } -} - -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) -{ - struct kvm *kvm = filp->private_data; - void __user *argp = (void __user *)arg; - - switch (ioctl) { - case KVM_CREATE_IRQCHIP: { - int ret; - if (!vgic_present) - return -ENXIO; - mutex_lock(&kvm->lock); - ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); - mutex_unlock(&kvm->lock); - return ret; - } - case KVM_ARM_SET_DEVICE_ADDR: { - struct kvm_arm_device_addr dev_addr; - - if (copy_from_user(&dev_addr, argp, sizeof(dev_addr))) - return -EFAULT; - return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr); - } - case KVM_ARM_PREFERRED_TARGET: { - int err; - struct kvm_vcpu_init init; - - err = kvm_vcpu_preferred_target(&init); - if (err) - return err; - - if (copy_to_user(argp, &init, sizeof(init))) - return -EFAULT; - - return 0; - } - default: - return -EINVAL; - } -} - -static void cpu_init_hyp_mode(void) -{ - phys_addr_t pgd_ptr; - unsigned long hyp_stack_ptr; - unsigned long stack_page; - unsigned long vector_ptr; - - /* Switch from the HYP stub to our own HYP init vector */ - __hyp_set_vectors(kvm_get_idmap_vector()); - - pgd_ptr = kvm_mmu_get_httbr(); - stack_page = __this_cpu_read(kvm_arm_hyp_stack_page); - hyp_stack_ptr = stack_page + PAGE_SIZE; - vector_ptr = (unsigned 
long)kvm_get_hyp_vector(); - - __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr); - __cpu_init_stage2(); -} - -static void cpu_hyp_reset(void) -{ - if (!is_kernel_in_hyp_mode()) - __hyp_reset_vectors(); -} - -static void cpu_hyp_reinit(void) -{ - kvm_init_host_cpu_context(&this_cpu_ptr(&kvm_host_data)->host_ctxt); - - cpu_hyp_reset(); - - if (is_kernel_in_hyp_mode()) - kvm_timer_init_vhe(); - else - cpu_init_hyp_mode(); - - kvm_arm_init_debug(); - - if (vgic_present) - kvm_vgic_init_cpu_hardware(); -} - -static void _kvm_arch_hardware_enable(void *discard) -{ - if (!__this_cpu_read(kvm_arm_hardware_enabled)) { - cpu_hyp_reinit(); - __this_cpu_write(kvm_arm_hardware_enabled, 1); - } -} - -int kvm_arch_hardware_enable(void) -{ - _kvm_arch_hardware_enable(NULL); - return 0; -} - -static void _kvm_arch_hardware_disable(void *discard) -{ - if (__this_cpu_read(kvm_arm_hardware_enabled)) { - cpu_hyp_reset(); - __this_cpu_write(kvm_arm_hardware_enabled, 0); - } -} - -void kvm_arch_hardware_disable(void) -{ - _kvm_arch_hardware_disable(NULL); -} - -#ifdef CONFIG_CPU_PM -static int hyp_init_cpu_pm_notifier(struct notifier_block *self, - unsigned long cmd, - void *v) -{ - /* - * kvm_arm_hardware_enabled is left with its old value over - * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should - * re-enable hyp. - */ - switch (cmd) { - case CPU_PM_ENTER: - if (__this_cpu_read(kvm_arm_hardware_enabled)) - /* - * don't update kvm_arm_hardware_enabled here - * so that the hardware will be re-enabled - * when we resume. See below. - */ - cpu_hyp_reset(); - - return NOTIFY_OK; - case CPU_PM_ENTER_FAILED: - case CPU_PM_EXIT: - if (__this_cpu_read(kvm_arm_hardware_enabled)) - /* The hardware was enabled before suspend. 
*/ - cpu_hyp_reinit(); - - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - -static struct notifier_block hyp_init_cpu_pm_nb = { - .notifier_call = hyp_init_cpu_pm_notifier, -}; - -static void __init hyp_cpu_pm_init(void) -{ - cpu_pm_register_notifier(&hyp_init_cpu_pm_nb); -} -static void __init hyp_cpu_pm_exit(void) -{ - cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb); -} -#else -static inline void hyp_cpu_pm_init(void) -{ -} -static inline void hyp_cpu_pm_exit(void) -{ -} -#endif - -static int init_common_resources(void) -{ - kvm_set_ipa_limit(); - - return 0; -} - -static int init_subsystems(void) -{ - int err = 0; - - /* - * Enable hardware so that subsystem initialisation can access EL2. - */ - on_each_cpu(_kvm_arch_hardware_enable, NULL, 1); - - /* - * Register CPU lower-power notifier - */ - hyp_cpu_pm_init(); - - /* - * Init HYP view of VGIC - */ - err = kvm_vgic_hyp_init(); - switch (err) { - case 0: - vgic_present = true; - break; - case -ENODEV: - case -ENXIO: - vgic_present = false; - err = 0; - break; - default: - goto out; - } - - /* - * Init HYP architected timer support - */ - err = kvm_timer_hyp_init(vgic_present); - if (err) - goto out; - - kvm_perf_init(); - kvm_coproc_table_init(); - -out: - on_each_cpu(_kvm_arch_hardware_disable, NULL, 1); - - return err; -} - -static void teardown_hyp_mode(void) -{ - int cpu; - - free_hyp_pgds(); - for_each_possible_cpu(cpu) - free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); -} - -/** - * Inits Hyp-mode on all online CPUs - */ -static int init_hyp_mode(void) -{ - int cpu; - int err = 0; - - /* - * Allocate Hyp PGD and setup Hyp identity mapping - */ - err = kvm_mmu_init(); - if (err) - goto out_err; - - /* - * Allocate stack pages for Hypervisor-mode - */ - for_each_possible_cpu(cpu) { - unsigned long stack_page; - - stack_page = __get_free_page(GFP_KERNEL); - if (!stack_page) { - err = -ENOMEM; - goto out_err; - } - - per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page; - } - - /* - * Map the 
Hyp-code called directly from the host - */ - err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start), - kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC); - if (err) { - kvm_err("Cannot map world-switch code\n"); - goto out_err; - } - - err = create_hyp_mappings(kvm_ksym_ref(__start_rodata), - kvm_ksym_ref(__end_rodata), PAGE_HYP_RO); - if (err) { - kvm_err("Cannot map rodata section\n"); - goto out_err; - } - - err = create_hyp_mappings(kvm_ksym_ref(__bss_start), - kvm_ksym_ref(__bss_stop), PAGE_HYP_RO); - if (err) { - kvm_err("Cannot map bss section\n"); - goto out_err; - } - - err = kvm_map_vectors(); - if (err) { - kvm_err("Cannot map vectors\n"); - goto out_err; - } - - /* - * Map the Hyp stack pages - */ - for_each_possible_cpu(cpu) { - char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu); - err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE, - PAGE_HYP); - - if (err) { - kvm_err("Cannot map hyp stack\n"); - goto out_err; - } - } - - for_each_possible_cpu(cpu) { - kvm_host_data_t *cpu_data; - - cpu_data = per_cpu_ptr(&kvm_host_data, cpu); - err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP); - - if (err) { - kvm_err("Cannot map host CPU state: %d\n", err); - goto out_err; - } - } - - err = hyp_map_aux_data(); - if (err) - kvm_err("Cannot map host auxiliary data: %d\n", err); - - return 0; - -out_err: - teardown_hyp_mode(); - kvm_err("error initializing Hyp mode: %d\n", err); - return err; -} - -static void check_kvm_target_cpu(void *ret) -{ - *(int *)ret = kvm_target_cpu(); -} - -struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) -{ - struct kvm_vcpu *vcpu; - int i; - - mpidr &= MPIDR_HWID_BITMASK; - kvm_for_each_vcpu(i, vcpu, kvm) { - if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu)) - return vcpu; - } - return NULL; -} - -bool kvm_arch_has_irq_bypass(void) -{ - return true; -} - -int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, - struct irq_bypass_producer *prod) -{ - struct kvm_kernel_irqfd 
*irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - - return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq, - &irqfd->irq_entry); -} -void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, - struct irq_bypass_producer *prod) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - - kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq, - &irqfd->irq_entry); -} - -void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - - kvm_arm_halt_guest(irqfd->kvm); -} - -void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - - kvm_arm_resume_guest(irqfd->kvm); -} - -/** - * Initialize Hyp-mode and memory mappings on all CPUs. - */ -int kvm_arch_init(void *opaque) -{ - int err; - int ret, cpu; - bool in_hyp_mode; - - if (!is_hyp_mode_available()) { - kvm_info("HYP mode not available\n"); - return -ENODEV; - } - - in_hyp_mode = is_kernel_in_hyp_mode(); - - if (!in_hyp_mode && kvm_arch_requires_vhe()) { - kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n"); - return -ENODEV; - } - - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1); - if (ret < 0) { - kvm_err("Error, CPU %d not supported!\n", cpu); - return -ENODEV; - } - } - - err = init_common_resources(); - if (err) - return err; - - err = kvm_arm_init_sve(); - if (err) - return err; - - if (!in_hyp_mode) { - err = init_hyp_mode(); - if (err) - goto out_err; - } - - err = init_subsystems(); - if (err) - goto out_hyp; - - if (in_hyp_mode) - kvm_info("VHE mode initialized successfully\n"); - else - kvm_info("Hyp mode initialized successfully\n"); - - return 0; - -out_hyp: - hyp_cpu_pm_exit(); - if (!in_hyp_mode) - teardown_hyp_mode(); -out_err: - return err; -} - -/* NOP: Compiling as a module 
not supported */ -void kvm_arch_exit(void) -{ - kvm_perf_teardown(); -} - -static int arm_init(void) -{ - int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); - return rc; -} - -module_init(arm_init); diff --git a/virt/kvm/arm/hyp/aarch32.c b/virt/kvm/arm/hyp/aarch32.c deleted file mode 100644 index 25c0e47d57cb..000000000000 --- a/virt/kvm/arm/hyp/aarch32.c +++ /dev/null @@ -1,140 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Hyp portion of the (not much of an) Emulation layer for 32bit guests. - * - * Copyright (C) 2012,2013 - ARM Ltd - * Author: Marc Zyngier - * - * based on arch/arm/kvm/emulate.c - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall - */ - -#include -#include -#include - -/* - * stolen from arch/arm/kernel/opcodes.c - * - * condition code lookup table - * index into the table is test code: EQ, NE, ... LT, GT, AL, NV - * - * bit position in short is condition code: NZCV - */ -static const unsigned short cc_map[16] = { - 0xF0F0, /* EQ == Z set */ - 0x0F0F, /* NE */ - 0xCCCC, /* CS == C set */ - 0x3333, /* CC */ - 0xFF00, /* MI == N set */ - 0x00FF, /* PL */ - 0xAAAA, /* VS == V set */ - 0x5555, /* VC */ - 0x0C0C, /* HI == C set && Z clear */ - 0xF3F3, /* LS == C clear || Z set */ - 0xAA55, /* GE == (N==V) */ - 0x55AA, /* LT == (N!=V) */ - 0x0A05, /* GT == (!Z && (N==V)) */ - 0xF5FA, /* LE == (Z || (N!=V)) */ - 0xFFFF, /* AL always */ - 0 /* NV */ -}; - -/* - * Check if a trapped instruction should have been executed or not. - */ -bool __hyp_text kvm_condition_valid32(const struct kvm_vcpu *vcpu) -{ - unsigned long cpsr; - u32 cpsr_cond; - int cond; - - /* Top two bits non-zero? Unconditional. */ - if (kvm_vcpu_get_hsr(vcpu) >> 30) - return true; - - /* Is condition field valid? */ - cond = kvm_vcpu_get_condition(vcpu); - if (cond == 0xE) - return true; - - cpsr = *vcpu_cpsr(vcpu); - - if (cond < 0) { - /* This can happen in Thumb mode: examine IT state. 
*/ - unsigned long it; - - it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3); - - /* it == 0 => unconditional. */ - if (it == 0) - return true; - - /* The cond for this insn works out as the top 4 bits. */ - cond = (it >> 4); - } - - cpsr_cond = cpsr >> 28; - - if (!((cc_map[cond] >> cpsr_cond) & 1)) - return false; - - return true; -} - -/** - * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block - * @vcpu: The VCPU pointer - * - * When exceptions occur while instructions are executed in Thumb IF-THEN - * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have - * to do this little bit of work manually. The fields map like this: - * - * IT[7:0] -> CPSR[26:25],CPSR[15:10] - */ -static void __hyp_text kvm_adjust_itstate(struct kvm_vcpu *vcpu) -{ - unsigned long itbits, cond; - unsigned long cpsr = *vcpu_cpsr(vcpu); - bool is_arm = !(cpsr & PSR_AA32_T_BIT); - - if (is_arm || !(cpsr & PSR_AA32_IT_MASK)) - return; - - cond = (cpsr & 0xe000) >> 13; - itbits = (cpsr & 0x1c00) >> (10 - 2); - itbits |= (cpsr & (0x3 << 25)) >> 25; - - /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */ - if ((itbits & 0x7) == 0) - itbits = cond = 0; - else - itbits = (itbits << 1) & 0x1f; - - cpsr &= ~PSR_AA32_IT_MASK; - cpsr |= cond << 13; - cpsr |= (itbits & 0x1c) << (10 - 2); - cpsr |= (itbits & 0x3) << 25; - *vcpu_cpsr(vcpu) = cpsr; -} - -/** - * kvm_skip_instr - skip a trapped instruction and proceed to the next - * @vcpu: The vcpu pointer - */ -void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr) -{ - u32 pc = *vcpu_pc(vcpu); - bool is_thumb; - - is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT); - if (is_thumb && !is_wide_instr) - pc += 2; - else - pc += 4; - - *vcpu_pc(vcpu) = pc; - - kvm_adjust_itstate(vcpu); -} diff --git a/virt/kvm/arm/hyp/timer-sr.c b/virt/kvm/arm/hyp/timer-sr.c deleted file mode 100644 index ff76e6845fe4..000000000000 --- a/virt/kvm/arm/hyp/timer-sr.c +++ /dev/null @@ -1,49 +0,0 @@ -// 
SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2012-2015 - ARM Ltd - * Author: Marc Zyngier - */ - -#include -#include -#include - -#include - -void __hyp_text __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high) -{ - u64 cntvoff = (u64)cntvoff_high << 32 | cntvoff_low; - write_sysreg(cntvoff, cntvoff_el2); -} - -/* - * Should only be called on non-VHE systems. - * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). - */ -void __hyp_text __timer_disable_traps(struct kvm_vcpu *vcpu) -{ - u64 val; - - /* Allow physical timer/counter access for the host */ - val = read_sysreg(cnthctl_el2); - val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN; - write_sysreg(val, cnthctl_el2); -} - -/* - * Should only be called on non-VHE systems. - * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). - */ -void __hyp_text __timer_enable_traps(struct kvm_vcpu *vcpu) -{ - u64 val; - - /* - * Disallow physical timer access for the guest - * Physical counter access is allowed - */ - val = read_sysreg(cnthctl_el2); - val &= ~CNTHCTL_EL1PCEN; - val |= CNTHCTL_EL1PCTEN; - write_sysreg(val, cnthctl_el2); -} diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c deleted file mode 100644 index ccf1fde9836c..000000000000 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ /dev/null @@ -1,1130 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2012-2015 - ARM Ltd - * Author: Marc Zyngier - */ - -#include -#include -#include - -#include -#include -#include - -#define vtr_to_max_lr_idx(v) ((v) & 0xf) -#define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1) -#define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5)) - -static u64 __hyp_text __gic_v3_get_lr(unsigned int lr) -{ - switch (lr & 0xf) { - case 0: - return read_gicreg(ICH_LR0_EL2); - case 1: - return read_gicreg(ICH_LR1_EL2); - case 2: - return read_gicreg(ICH_LR2_EL2); - case 3: - return read_gicreg(ICH_LR3_EL2); - case 4: - return 
read_gicreg(ICH_LR4_EL2); - case 5: - return read_gicreg(ICH_LR5_EL2); - case 6: - return read_gicreg(ICH_LR6_EL2); - case 7: - return read_gicreg(ICH_LR7_EL2); - case 8: - return read_gicreg(ICH_LR8_EL2); - case 9: - return read_gicreg(ICH_LR9_EL2); - case 10: - return read_gicreg(ICH_LR10_EL2); - case 11: - return read_gicreg(ICH_LR11_EL2); - case 12: - return read_gicreg(ICH_LR12_EL2); - case 13: - return read_gicreg(ICH_LR13_EL2); - case 14: - return read_gicreg(ICH_LR14_EL2); - case 15: - return read_gicreg(ICH_LR15_EL2); - } - - unreachable(); -} - -static void __hyp_text __gic_v3_set_lr(u64 val, int lr) -{ - switch (lr & 0xf) { - case 0: - write_gicreg(val, ICH_LR0_EL2); - break; - case 1: - write_gicreg(val, ICH_LR1_EL2); - break; - case 2: - write_gicreg(val, ICH_LR2_EL2); - break; - case 3: - write_gicreg(val, ICH_LR3_EL2); - break; - case 4: - write_gicreg(val, ICH_LR4_EL2); - break; - case 5: - write_gicreg(val, ICH_LR5_EL2); - break; - case 6: - write_gicreg(val, ICH_LR6_EL2); - break; - case 7: - write_gicreg(val, ICH_LR7_EL2); - break; - case 8: - write_gicreg(val, ICH_LR8_EL2); - break; - case 9: - write_gicreg(val, ICH_LR9_EL2); - break; - case 10: - write_gicreg(val, ICH_LR10_EL2); - break; - case 11: - write_gicreg(val, ICH_LR11_EL2); - break; - case 12: - write_gicreg(val, ICH_LR12_EL2); - break; - case 13: - write_gicreg(val, ICH_LR13_EL2); - break; - case 14: - write_gicreg(val, ICH_LR14_EL2); - break; - case 15: - write_gicreg(val, ICH_LR15_EL2); - break; - } -} - -static void __hyp_text __vgic_v3_write_ap0rn(u32 val, int n) -{ - switch (n) { - case 0: - write_gicreg(val, ICH_AP0R0_EL2); - break; - case 1: - write_gicreg(val, ICH_AP0R1_EL2); - break; - case 2: - write_gicreg(val, ICH_AP0R2_EL2); - break; - case 3: - write_gicreg(val, ICH_AP0R3_EL2); - break; - } -} - -static void __hyp_text __vgic_v3_write_ap1rn(u32 val, int n) -{ - switch (n) { - case 0: - write_gicreg(val, ICH_AP1R0_EL2); - break; - case 1: - write_gicreg(val, 
ICH_AP1R1_EL2); - break; - case 2: - write_gicreg(val, ICH_AP1R2_EL2); - break; - case 3: - write_gicreg(val, ICH_AP1R3_EL2); - break; - } -} - -static u32 __hyp_text __vgic_v3_read_ap0rn(int n) -{ - u32 val; - - switch (n) { - case 0: - val = read_gicreg(ICH_AP0R0_EL2); - break; - case 1: - val = read_gicreg(ICH_AP0R1_EL2); - break; - case 2: - val = read_gicreg(ICH_AP0R2_EL2); - break; - case 3: - val = read_gicreg(ICH_AP0R3_EL2); - break; - default: - unreachable(); - } - - return val; -} - -static u32 __hyp_text __vgic_v3_read_ap1rn(int n) -{ - u32 val; - - switch (n) { - case 0: - val = read_gicreg(ICH_AP1R0_EL2); - break; - case 1: - val = read_gicreg(ICH_AP1R1_EL2); - break; - case 2: - val = read_gicreg(ICH_AP1R2_EL2); - break; - case 3: - val = read_gicreg(ICH_AP1R3_EL2); - break; - default: - unreachable(); - } - - return val; -} - -void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; - - /* - * Make sure stores to the GIC via the memory mapped interface - * are now visible to the system register interface when reading the - * LRs, and when reading back the VMCR on non-VHE systems. 
- */ - if (used_lrs || !has_vhe()) { - if (!cpu_if->vgic_sre) { - dsb(sy); - isb(); - } - } - - if (used_lrs || cpu_if->its_vpe.its_vm) { - int i; - u32 elrsr; - - elrsr = read_gicreg(ICH_ELRSR_EL2); - - write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2); - - for (i = 0; i < used_lrs; i++) { - if (elrsr & (1 << i)) - cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; - else - cpu_if->vgic_lr[i] = __gic_v3_get_lr(i); - - __gic_v3_set_lr(0, i); - } - } -} - -void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; - int i; - - if (used_lrs || cpu_if->its_vpe.its_vm) { - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); - - for (i = 0; i < used_lrs; i++) - __gic_v3_set_lr(cpu_if->vgic_lr[i], i); - } - - /* - * Ensure that writes to the LRs, and on non-VHE systems ensure that - * the write to the VMCR in __vgic_v3_activate_traps(), will have - * reached the (re)distributors. This ensure the guest will read the - * correct values from the memory-mapped interface. - */ - if (used_lrs || !has_vhe()) { - if (!cpu_if->vgic_sre) { - isb(); - dsb(sy); - } - } -} - -void __hyp_text __vgic_v3_activate_traps(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - - /* - * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a - * Group0 interrupt (as generated in GICv2 mode) to be - * delivered as a FIQ to the guest, with potentially fatal - * consequences. So we must make sure that ICC_SRE_EL1 has - * been actually programmed with the value we want before - * starting to mess with the rest of the GIC, and VMCR_EL2 in - * particular. This logic must be called before - * __vgic_v3_restore_state(). - */ - if (!cpu_if->vgic_sre) { - write_gicreg(0, ICC_SRE_EL1); - isb(); - write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); - - - if (has_vhe()) { - /* - * Ensure that the write to the VMCR will have reached - * the (re)distributors. 
This ensure the guest will - * read the correct values from the memory-mapped - * interface. - */ - isb(); - dsb(sy); - } - } - - /* - * Prevent the guest from touching the GIC system registers if - * SRE isn't enabled for GICv3 emulation. - */ - write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, - ICC_SRE_EL2); - - /* - * If we need to trap system registers, we must write - * ICH_HCR_EL2 anyway, even if no interrupts are being - * injected, - */ - if (static_branch_unlikely(&vgic_v3_cpuif_trap) || - cpu_if->its_vpe.its_vm) - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); -} - -void __hyp_text __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 val; - - if (!cpu_if->vgic_sre) { - cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); - } - - val = read_gicreg(ICC_SRE_EL2); - write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); - - if (!cpu_if->vgic_sre) { - /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ - isb(); - write_gicreg(1, ICC_SRE_EL1); - } - - /* - * If we were trapping system registers, we enabled the VGIC even if - * no interrupts were being injected, and we disable it again here. 
- */ - if (static_branch_unlikely(&vgic_v3_cpuif_trap) || - cpu_if->its_vpe.its_vm) - write_gicreg(0, ICH_HCR_EL2); -} - -void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if; - u64 val; - u32 nr_pre_bits; - - vcpu = kern_hyp_va(vcpu); - cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - - val = read_gicreg(ICH_VTR_EL2); - nr_pre_bits = vtr_to_nr_pre_bits(val); - - switch (nr_pre_bits) { - case 7: - cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3); - cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2); - /* Fall through */ - case 6: - cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1); - /* Fall through */ - default: - cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0); - } - - switch (nr_pre_bits) { - case 7: - cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3); - cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2); - /* Fall through */ - case 6: - cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1); - /* Fall through */ - default: - cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0); - } -} - -void __hyp_text __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if; - u64 val; - u32 nr_pre_bits; - - vcpu = kern_hyp_va(vcpu); - cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - - val = read_gicreg(ICH_VTR_EL2); - nr_pre_bits = vtr_to_nr_pre_bits(val); - - switch (nr_pre_bits) { - case 7: - __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3); - __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2); - /* Fall through */ - case 6: - __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1); - /* Fall through */ - default: - __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0); - } - - switch (nr_pre_bits) { - case 7: - __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3); - __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2); - /* Fall through */ - case 6: - __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1); - /* Fall through */ - default: - __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0); - } -} - -void __hyp_text __vgic_v3_init_lrs(void) -{ - int max_lr_idx = 
vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2)); - int i; - - for (i = 0; i <= max_lr_idx; i++) - __gic_v3_set_lr(0, i); -} - -u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void) -{ - return read_gicreg(ICH_VTR_EL2); -} - -u64 __hyp_text __vgic_v3_read_vmcr(void) -{ - return read_gicreg(ICH_VMCR_EL2); -} - -void __hyp_text __vgic_v3_write_vmcr(u32 vmcr) -{ - write_gicreg(vmcr, ICH_VMCR_EL2); -} - -#ifdef CONFIG_ARM64 - -static int __hyp_text __vgic_v3_bpr_min(void) -{ - /* See Pseudocode for VPriorityGroup */ - return 8 - vtr_to_nr_pre_bits(read_gicreg(ICH_VTR_EL2)); -} - -static int __hyp_text __vgic_v3_get_group(struct kvm_vcpu *vcpu) -{ - u32 esr = kvm_vcpu_get_hsr(vcpu); - u8 crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT; - - return crm != 8; -} - -#define GICv3_IDLE_PRIORITY 0xff - -static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu, - u32 vmcr, - u64 *lr_val) -{ - unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs; - u8 priority = GICv3_IDLE_PRIORITY; - int i, lr = -1; - - for (i = 0; i < used_lrs; i++) { - u64 val = __gic_v3_get_lr(i); - u8 lr_prio = (val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; - - /* Not pending in the state? */ - if ((val & ICH_LR_STATE) != ICH_LR_PENDING_BIT) - continue; - - /* Group-0 interrupt, but Group-0 disabled? */ - if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG0_MASK)) - continue; - - /* Group-1 interrupt, but Group-1 disabled? */ - if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG1_MASK)) - continue; - - /* Not the highest priority? 
*/ - if (lr_prio >= priority) - continue; - - /* This is a candidate */ - priority = lr_prio; - *lr_val = val; - lr = i; - } - - if (lr == -1) - *lr_val = ICC_IAR1_EL1_SPURIOUS; - - return lr; -} - -static int __hyp_text __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu, - int intid, u64 *lr_val) -{ - unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs; - int i; - - for (i = 0; i < used_lrs; i++) { - u64 val = __gic_v3_get_lr(i); - - if ((val & ICH_LR_VIRTUAL_ID_MASK) == intid && - (val & ICH_LR_ACTIVE_BIT)) { - *lr_val = val; - return i; - } - } - - *lr_val = ICC_IAR1_EL1_SPURIOUS; - return -1; -} - -static int __hyp_text __vgic_v3_get_highest_active_priority(void) -{ - u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2)); - u32 hap = 0; - int i; - - for (i = 0; i < nr_apr_regs; i++) { - u32 val; - - /* - * The ICH_AP0Rn_EL2 and ICH_AP1Rn_EL2 registers - * contain the active priority levels for this VCPU - * for the maximum number of supported priority - * levels, and we return the full priority level only - * if the BPR is programmed to its minimum, otherwise - * we return a combination of the priority level and - * subpriority, as determined by the setting of the - * BPR, but without the full subpriority. - */ - val = __vgic_v3_read_ap0rn(i); - val |= __vgic_v3_read_ap1rn(i); - if (!val) { - hap += 32; - continue; - } - - return (hap + __ffs(val)) << __vgic_v3_bpr_min(); - } - - return GICv3_IDLE_PRIORITY; -} - -static unsigned int __hyp_text __vgic_v3_get_bpr0(u32 vmcr) -{ - return (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; -} - -static unsigned int __hyp_text __vgic_v3_get_bpr1(u32 vmcr) -{ - unsigned int bpr; - - if (vmcr & ICH_VMCR_CBPR_MASK) { - bpr = __vgic_v3_get_bpr0(vmcr); - if (bpr < 7) - bpr++; - } else { - bpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; - } - - return bpr; -} - -/* - * Convert a priority to a preemption level, taking the relevant BPR - * into account by zeroing the sub-priority bits. 
- */ -static u8 __hyp_text __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp) -{ - unsigned int bpr; - - if (!grp) - bpr = __vgic_v3_get_bpr0(vmcr) + 1; - else - bpr = __vgic_v3_get_bpr1(vmcr); - - return pri & (GENMASK(7, 0) << bpr); -} - -/* - * The priority value is independent of any of the BPR values, so we - * normalize it using the minumal BPR value. This guarantees that no - * matter what the guest does with its BPR, we can always set/get the - * same value of a priority. - */ -static void __hyp_text __vgic_v3_set_active_priority(u8 pri, u32 vmcr, int grp) -{ - u8 pre, ap; - u32 val; - int apr; - - pre = __vgic_v3_pri_to_pre(pri, vmcr, grp); - ap = pre >> __vgic_v3_bpr_min(); - apr = ap / 32; - - if (!grp) { - val = __vgic_v3_read_ap0rn(apr); - __vgic_v3_write_ap0rn(val | BIT(ap % 32), apr); - } else { - val = __vgic_v3_read_ap1rn(apr); - __vgic_v3_write_ap1rn(val | BIT(ap % 32), apr); - } -} - -static int __hyp_text __vgic_v3_clear_highest_active_priority(void) -{ - u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2)); - u32 hap = 0; - int i; - - for (i = 0; i < nr_apr_regs; i++) { - u32 ap0, ap1; - int c0, c1; - - ap0 = __vgic_v3_read_ap0rn(i); - ap1 = __vgic_v3_read_ap1rn(i); - if (!ap0 && !ap1) { - hap += 32; - continue; - } - - c0 = ap0 ? __ffs(ap0) : 32; - c1 = ap1 ? 
__ffs(ap1) : 32; - - /* Always clear the LSB, which is the highest priority */ - if (c0 < c1) { - ap0 &= ~BIT(c0); - __vgic_v3_write_ap0rn(ap0, i); - hap += c0; - } else { - ap1 &= ~BIT(c1); - __vgic_v3_write_ap1rn(ap1, i); - hap += c1; - } - - /* Rescale to 8 bits of priority */ - return hap << __vgic_v3_bpr_min(); - } - - return GICv3_IDLE_PRIORITY; -} - -static void __hyp_text __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - u64 lr_val; - u8 lr_prio, pmr; - int lr, grp; - - grp = __vgic_v3_get_group(vcpu); - - lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val); - if (lr < 0) - goto spurious; - - if (grp != !!(lr_val & ICH_LR_GROUP)) - goto spurious; - - pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; - lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; - if (pmr <= lr_prio) - goto spurious; - - if (__vgic_v3_get_highest_active_priority() <= __vgic_v3_pri_to_pre(lr_prio, vmcr, grp)) - goto spurious; - - lr_val &= ~ICH_LR_STATE; - /* No active state for LPIs */ - if ((lr_val & ICH_LR_VIRTUAL_ID_MASK) <= VGIC_MAX_SPI) - lr_val |= ICH_LR_ACTIVE_BIT; - __gic_v3_set_lr(lr_val, lr); - __vgic_v3_set_active_priority(lr_prio, vmcr, grp); - vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK); - return; - -spurious: - vcpu_set_reg(vcpu, rt, ICC_IAR1_EL1_SPURIOUS); -} - -static void __hyp_text __vgic_v3_clear_active_lr(int lr, u64 lr_val) -{ - lr_val &= ~ICH_LR_ACTIVE_BIT; - if (lr_val & ICH_LR_HW) { - u32 pid; - - pid = (lr_val & ICH_LR_PHYS_ID_MASK) >> ICH_LR_PHYS_ID_SHIFT; - gic_write_dir(pid); - } - - __gic_v3_set_lr(lr_val, lr); -} - -static void __hyp_text __vgic_v3_bump_eoicount(void) -{ - u32 hcr; - - hcr = read_gicreg(ICH_HCR_EL2); - hcr += 1 << ICH_HCR_EOIcount_SHIFT; - write_gicreg(hcr, ICH_HCR_EL2); -} - -static void __hyp_text __vgic_v3_write_dir(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - u32 vid = vcpu_get_reg(vcpu, rt); - u64 lr_val; - int lr; - - /* EOImode == 0, nothing to be done here */ - if (!(vmcr & 
ICH_VMCR_EOIM_MASK)) - return; - - /* No deactivate to be performed on an LPI */ - if (vid >= VGIC_MIN_LPI) - return; - - lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); - if (lr == -1) { - __vgic_v3_bump_eoicount(); - return; - } - - __vgic_v3_clear_active_lr(lr, lr_val); -} - -static void __hyp_text __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - u32 vid = vcpu_get_reg(vcpu, rt); - u64 lr_val; - u8 lr_prio, act_prio; - int lr, grp; - - grp = __vgic_v3_get_group(vcpu); - - /* Drop priority in any case */ - act_prio = __vgic_v3_clear_highest_active_priority(); - - /* If EOIing an LPI, no deactivate to be performed */ - if (vid >= VGIC_MIN_LPI) - return; - - /* EOImode == 1, nothing to be done here */ - if (vmcr & ICH_VMCR_EOIM_MASK) - return; - - lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); - if (lr == -1) { - __vgic_v3_bump_eoicount(); - return; - } - - lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; - - /* If priorities or group do not match, the guest has fscked-up. 
*/ - if (grp != !!(lr_val & ICH_LR_GROUP) || - __vgic_v3_pri_to_pre(lr_prio, vmcr, grp) != act_prio) - return; - - /* Let's now perform the deactivation */ - __vgic_v3_clear_active_lr(lr, lr_val); -} - -static void __hyp_text __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG0_MASK)); -} - -static void __hyp_text __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG1_MASK)); -} - -static void __hyp_text __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - u64 val = vcpu_get_reg(vcpu, rt); - - if (val & 1) - vmcr |= ICH_VMCR_ENG0_MASK; - else - vmcr &= ~ICH_VMCR_ENG0_MASK; - - __vgic_v3_write_vmcr(vmcr); -} - -static void __hyp_text __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - u64 val = vcpu_get_reg(vcpu, rt); - - if (val & 1) - vmcr |= ICH_VMCR_ENG1_MASK; - else - vmcr &= ~ICH_VMCR_ENG1_MASK; - - __vgic_v3_write_vmcr(vmcr); -} - -static void __hyp_text __vgic_v3_read_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr0(vmcr)); -} - -static void __hyp_text __vgic_v3_read_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr1(vmcr)); -} - -static void __hyp_text __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - u64 val = vcpu_get_reg(vcpu, rt); - u8 bpr_min = __vgic_v3_bpr_min() - 1; - - /* Enforce BPR limiting */ - if (val < bpr_min) - val = bpr_min; - - val <<= ICH_VMCR_BPR0_SHIFT; - val &= ICH_VMCR_BPR0_MASK; - vmcr &= ~ICH_VMCR_BPR0_MASK; - vmcr |= val; - - __vgic_v3_write_vmcr(vmcr); -} - -static void __hyp_text __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - u64 val = vcpu_get_reg(vcpu, rt); - u8 bpr_min = __vgic_v3_bpr_min(); - - if (vmcr & ICH_VMCR_CBPR_MASK) - return; - - /* Enforce BPR limiting */ - if (val < bpr_min) - val = bpr_min; - - val <<= ICH_VMCR_BPR1_SHIFT; - val &= 
ICH_VMCR_BPR1_MASK; - vmcr &= ~ICH_VMCR_BPR1_MASK; - vmcr |= val; - - __vgic_v3_write_vmcr(vmcr); -} - -static void __hyp_text __vgic_v3_read_apxrn(struct kvm_vcpu *vcpu, int rt, int n) -{ - u32 val; - - if (!__vgic_v3_get_group(vcpu)) - val = __vgic_v3_read_ap0rn(n); - else - val = __vgic_v3_read_ap1rn(n); - - vcpu_set_reg(vcpu, rt, val); -} - -static void __hyp_text __vgic_v3_write_apxrn(struct kvm_vcpu *vcpu, int rt, int n) -{ - u32 val = vcpu_get_reg(vcpu, rt); - - if (!__vgic_v3_get_group(vcpu)) - __vgic_v3_write_ap0rn(val, n); - else - __vgic_v3_write_ap1rn(val, n); -} - -static void __hyp_text __vgic_v3_read_apxr0(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - __vgic_v3_read_apxrn(vcpu, rt, 0); -} - -static void __hyp_text __vgic_v3_read_apxr1(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - __vgic_v3_read_apxrn(vcpu, rt, 1); -} - -static void __hyp_text __vgic_v3_read_apxr2(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - __vgic_v3_read_apxrn(vcpu, rt, 2); -} - -static void __hyp_text __vgic_v3_read_apxr3(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - __vgic_v3_read_apxrn(vcpu, rt, 3); -} - -static void __hyp_text __vgic_v3_write_apxr0(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - __vgic_v3_write_apxrn(vcpu, rt, 0); -} - -static void __hyp_text __vgic_v3_write_apxr1(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - __vgic_v3_write_apxrn(vcpu, rt, 1); -} - -static void __hyp_text __vgic_v3_write_apxr2(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - __vgic_v3_write_apxrn(vcpu, rt, 2); -} - -static void __hyp_text __vgic_v3_write_apxr3(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - __vgic_v3_write_apxrn(vcpu, rt, 3); -} - -static void __hyp_text __vgic_v3_read_hppir(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - u64 lr_val; - int lr, lr_grp, grp; - - grp = __vgic_v3_get_group(vcpu); - - lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val); - if (lr == -1) - goto spurious; - - lr_grp = !!(lr_val & ICH_LR_GROUP); - if (lr_grp != grp) - lr_val = 
ICC_IAR1_EL1_SPURIOUS; - -spurious: - vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK); -} - -static void __hyp_text __vgic_v3_read_pmr(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - vmcr &= ICH_VMCR_PMR_MASK; - vmcr >>= ICH_VMCR_PMR_SHIFT; - vcpu_set_reg(vcpu, rt, vmcr); -} - -static void __hyp_text __vgic_v3_write_pmr(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - u32 val = vcpu_get_reg(vcpu, rt); - - val <<= ICH_VMCR_PMR_SHIFT; - val &= ICH_VMCR_PMR_MASK; - vmcr &= ~ICH_VMCR_PMR_MASK; - vmcr |= val; - - write_gicreg(vmcr, ICH_VMCR_EL2); -} - -static void __hyp_text __vgic_v3_read_rpr(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - u32 val = __vgic_v3_get_highest_active_priority(); - vcpu_set_reg(vcpu, rt, val); -} - -static void __hyp_text __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - u32 vtr, val; - - vtr = read_gicreg(ICH_VTR_EL2); - /* PRIbits */ - val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; - /* IDbits */ - val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; - /* SEIS */ - val |= ((vtr >> 22) & 1) << ICC_CTLR_EL1_SEIS_SHIFT; - /* A3V */ - val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; - /* EOImode */ - val |= ((vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT; - /* CBPR */ - val |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; - - vcpu_set_reg(vcpu, rt, val); -} - -static void __hyp_text __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - u32 val = vcpu_get_reg(vcpu, rt); - - if (val & ICC_CTLR_EL1_CBPR_MASK) - vmcr |= ICH_VMCR_CBPR_MASK; - else - vmcr &= ~ICH_VMCR_CBPR_MASK; - - if (val & ICC_CTLR_EL1_EOImode_MASK) - vmcr |= ICH_VMCR_EOIM_MASK; - else - vmcr &= ~ICH_VMCR_EOIM_MASK; - - write_gicreg(vmcr, ICH_VMCR_EL2); -} - -int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) -{ - int rt; - u32 esr; - u32 vmcr; - void (*fn)(struct kvm_vcpu *, u32, int); - bool is_read; - u32 sysreg; - - esr = kvm_vcpu_get_hsr(vcpu); - if 
(vcpu_mode_is_32bit(vcpu)) { - if (!kvm_condition_valid(vcpu)) { - __kvm_skip_instr(vcpu); - return 1; - } - - sysreg = esr_cp15_to_sysreg(esr); - } else { - sysreg = esr_sys64_to_sysreg(esr); - } - - is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; - - switch (sysreg) { - case SYS_ICC_IAR0_EL1: - case SYS_ICC_IAR1_EL1: - if (unlikely(!is_read)) - return 0; - fn = __vgic_v3_read_iar; - break; - case SYS_ICC_EOIR0_EL1: - case SYS_ICC_EOIR1_EL1: - if (unlikely(is_read)) - return 0; - fn = __vgic_v3_write_eoir; - break; - case SYS_ICC_IGRPEN1_EL1: - if (is_read) - fn = __vgic_v3_read_igrpen1; - else - fn = __vgic_v3_write_igrpen1; - break; - case SYS_ICC_BPR1_EL1: - if (is_read) - fn = __vgic_v3_read_bpr1; - else - fn = __vgic_v3_write_bpr1; - break; - case SYS_ICC_AP0Rn_EL1(0): - case SYS_ICC_AP1Rn_EL1(0): - if (is_read) - fn = __vgic_v3_read_apxr0; - else - fn = __vgic_v3_write_apxr0; - break; - case SYS_ICC_AP0Rn_EL1(1): - case SYS_ICC_AP1Rn_EL1(1): - if (is_read) - fn = __vgic_v3_read_apxr1; - else - fn = __vgic_v3_write_apxr1; - break; - case SYS_ICC_AP0Rn_EL1(2): - case SYS_ICC_AP1Rn_EL1(2): - if (is_read) - fn = __vgic_v3_read_apxr2; - else - fn = __vgic_v3_write_apxr2; - break; - case SYS_ICC_AP0Rn_EL1(3): - case SYS_ICC_AP1Rn_EL1(3): - if (is_read) - fn = __vgic_v3_read_apxr3; - else - fn = __vgic_v3_write_apxr3; - break; - case SYS_ICC_HPPIR0_EL1: - case SYS_ICC_HPPIR1_EL1: - if (unlikely(!is_read)) - return 0; - fn = __vgic_v3_read_hppir; - break; - case SYS_ICC_IGRPEN0_EL1: - if (is_read) - fn = __vgic_v3_read_igrpen0; - else - fn = __vgic_v3_write_igrpen0; - break; - case SYS_ICC_BPR0_EL1: - if (is_read) - fn = __vgic_v3_read_bpr0; - else - fn = __vgic_v3_write_bpr0; - break; - case SYS_ICC_DIR_EL1: - if (unlikely(is_read)) - return 0; - fn = __vgic_v3_write_dir; - break; - case SYS_ICC_RPR_EL1: - if (unlikely(!is_read)) - return 0; - fn = __vgic_v3_read_rpr; - break; - case SYS_ICC_CTLR_EL1: - if (is_read) - fn = 
__vgic_v3_read_ctlr; - else - fn = __vgic_v3_write_ctlr; - break; - case SYS_ICC_PMR_EL1: - if (is_read) - fn = __vgic_v3_read_pmr; - else - fn = __vgic_v3_write_pmr; - break; - default: - return 0; - } - - vmcr = __vgic_v3_read_vmcr(); - rt = kvm_vcpu_sys_get_rt(vcpu); - fn(vcpu, vmcr, rt); - - __kvm_skip_instr(vcpu); - - return 1; -} - -#endif diff --git a/virt/kvm/arm/hypercalls.c b/virt/kvm/arm/hypercalls.c deleted file mode 100644 index 550dfa3e53cd..000000000000 --- a/virt/kvm/arm/hypercalls.c +++ /dev/null @@ -1,71 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (C) 2019 Arm Ltd. - -#include -#include - -#include - -#include -#include - -int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) -{ - u32 func_id = smccc_get_function(vcpu); - long val = SMCCC_RET_NOT_SUPPORTED; - u32 feature; - gpa_t gpa; - - switch (func_id) { - case ARM_SMCCC_VERSION_FUNC_ID: - val = ARM_SMCCC_VERSION_1_1; - break; - case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: - feature = smccc_get_arg1(vcpu); - switch (feature) { - case ARM_SMCCC_ARCH_WORKAROUND_1: - switch (kvm_arm_harden_branch_predictor()) { - case KVM_BP_HARDEN_UNKNOWN: - break; - case KVM_BP_HARDEN_WA_NEEDED: - val = SMCCC_RET_SUCCESS; - break; - case KVM_BP_HARDEN_NOT_REQUIRED: - val = SMCCC_RET_NOT_REQUIRED; - break; - } - break; - case ARM_SMCCC_ARCH_WORKAROUND_2: - switch (kvm_arm_have_ssbd()) { - case KVM_SSBD_FORCE_DISABLE: - case KVM_SSBD_UNKNOWN: - break; - case KVM_SSBD_KERNEL: - val = SMCCC_RET_SUCCESS; - break; - case KVM_SSBD_FORCE_ENABLE: - case KVM_SSBD_MITIGATED: - val = SMCCC_RET_NOT_REQUIRED; - break; - } - break; - case ARM_SMCCC_HV_PV_TIME_FEATURES: - val = SMCCC_RET_SUCCESS; - break; - } - break; - case ARM_SMCCC_HV_PV_TIME_FEATURES: - val = kvm_hypercall_pv_features(vcpu); - break; - case ARM_SMCCC_HV_PV_TIME_ST: - gpa = kvm_init_stolen_time(vcpu); - if (gpa != GPA_INVALID) - val = gpa; - break; - default: - return kvm_psci_call(vcpu); - } - - smccc_set_retval(vcpu, val, 0, 0, 0); - return 1; -} diff 
--git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c deleted file mode 100644 index aedfcff99ac5..000000000000 --- a/virt/kvm/arm/mmio.c +++ /dev/null @@ -1,200 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall - */ - -#include -#include -#include - -#include "trace.h" - -void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data) -{ - void *datap = NULL; - union { - u8 byte; - u16 hword; - u32 word; - u64 dword; - } tmp; - - switch (len) { - case 1: - tmp.byte = data; - datap = &tmp.byte; - break; - case 2: - tmp.hword = data; - datap = &tmp.hword; - break; - case 4: - tmp.word = data; - datap = &tmp.word; - break; - case 8: - tmp.dword = data; - datap = &tmp.dword; - break; - } - - memcpy(buf, datap, len); -} - -unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len) -{ - unsigned long data = 0; - union { - u16 hword; - u32 word; - u64 dword; - } tmp; - - switch (len) { - case 1: - data = *(u8 *)buf; - break; - case 2: - memcpy(&tmp.hword, buf, len); - data = tmp.hword; - break; - case 4: - memcpy(&tmp.word, buf, len); - data = tmp.word; - break; - case 8: - memcpy(&tmp.dword, buf, len); - data = tmp.dword; - break; - } - - return data; -} - -/** - * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation - * or in-kernel IO emulation - * - * @vcpu: The VCPU pointer - * @run: The VCPU run struct containing the mmio data - */ -int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - unsigned long data; - unsigned int len; - int mask; - - /* Detect an already handled MMIO return */ - if (unlikely(!vcpu->mmio_needed)) - return 0; - - vcpu->mmio_needed = 0; - - if (!kvm_vcpu_dabt_iswrite(vcpu)) { - len = kvm_vcpu_dabt_get_as(vcpu); - data = kvm_mmio_read_buf(run->mmio.data, len); - - if (kvm_vcpu_dabt_issext(vcpu) && - len < sizeof(unsigned long)) { - mask = 1U << ((len * 8) - 1); - data = (data ^ mask) - mask; 
- } - - if (!kvm_vcpu_dabt_issf(vcpu)) - data = data & 0xffffffff; - - trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, - &data); - data = vcpu_data_host_to_guest(vcpu, data, len); - vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data); - } - - /* - * The MMIO instruction is emulated and should not be re-executed - * in the guest. - */ - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - - return 0; -} - -int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, - phys_addr_t fault_ipa) -{ - unsigned long data; - unsigned long rt; - int ret; - bool is_write; - int len; - u8 data_buf[8]; - - /* - * No valid syndrome? Ask userspace for help if it has - * voluntered to do so, and bail out otherwise. - */ - if (!kvm_vcpu_dabt_isvalid(vcpu)) { - if (vcpu->kvm->arch.return_nisv_io_abort_to_user) { - run->exit_reason = KVM_EXIT_ARM_NISV; - run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu); - run->arm_nisv.fault_ipa = fault_ipa; - return 0; - } - - kvm_pr_unimpl("Data abort outside memslots with no valid syndrome info\n"); - return -ENOSYS; - } - - /* Page table accesses IO mem: tell guest to fix its TTBR */ - if (kvm_vcpu_dabt_iss1tw(vcpu)) { - kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); - return 1; - } - - /* - * Prepare MMIO operation. First decode the syndrome data we get - * from the CPU. Then try if some in-kernel emulation feels - * responsible, otherwise let user space do its magic. 
- */ - is_write = kvm_vcpu_dabt_iswrite(vcpu); - len = kvm_vcpu_dabt_get_as(vcpu); - rt = kvm_vcpu_dabt_get_rd(vcpu); - - if (is_write) { - data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt), - len); - - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data); - kvm_mmio_write_buf(data_buf, len, data); - - ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len, - data_buf); - } else { - trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len, - fault_ipa, NULL); - - ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len, - data_buf); - } - - /* Now prepare kvm_run for the potential return to userland. */ - run->mmio.is_write = is_write; - run->mmio.phys_addr = fault_ipa; - run->mmio.len = len; - vcpu->mmio_needed = 1; - - if (!ret) { - /* We handled the access successfully in the kernel. */ - if (!is_write) - memcpy(run->mmio.data, data_buf, len); - vcpu->stat.mmio_exit_kernel++; - kvm_handle_mmio_return(vcpu, run); - return 1; - } - - if (is_write) - memcpy(run->mmio.data, data_buf, len); - vcpu->stat.mmio_exit_user++; - run->exit_reason = KVM_EXIT_MMIO; - return 0; -} diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c deleted file mode 100644 index e3b9ee268823..000000000000 --- a/virt/kvm/arm/mmu.c +++ /dev/null @@ -1,2447 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" - -static pgd_t *boot_hyp_pgd; -static pgd_t *hyp_pgd; -static pgd_t *merged_hyp_pgd; -static DEFINE_MUTEX(kvm_hyp_pgd_mutex); - -static unsigned long hyp_idmap_start; -static unsigned long hyp_idmap_end; -static phys_addr_t hyp_idmap_vector; - -static unsigned long io_map_base; - -#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) - -#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) -#define 
KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1) - -static bool is_iomap(unsigned long flags) -{ - return flags & KVM_S2PTE_FLAG_IS_IOMAP; -} - -static bool memslot_is_logging(struct kvm_memory_slot *memslot) -{ - return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); -} - -/** - * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8 - * @kvm: pointer to kvm structure. - * - * Interface to HYP function to flush all VM TLB entries - */ -void kvm_flush_remote_tlbs(struct kvm *kvm) -{ - kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); -} - -static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) -{ - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); -} - -/* - * D-Cache management functions. They take the page table entries by - * value, as they are flushing the cache using the kernel mapping (or - * kmap on 32bit). - */ -static void kvm_flush_dcache_pte(pte_t pte) -{ - __kvm_flush_dcache_pte(pte); -} - -static void kvm_flush_dcache_pmd(pmd_t pmd) -{ - __kvm_flush_dcache_pmd(pmd); -} - -static void kvm_flush_dcache_pud(pud_t pud) -{ - __kvm_flush_dcache_pud(pud); -} - -static bool kvm_is_device_pfn(unsigned long pfn) -{ - return !pfn_valid(pfn); -} - -/** - * stage2_dissolve_pmd() - clear and flush huge PMD entry - * @kvm: pointer to kvm structure. - * @addr: IPA - * @pmd: pmd pointer for IPA - * - * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. - */ -static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) -{ - if (!pmd_thp_or_huge(*pmd)) - return; - - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(kvm, addr); - put_page(virt_to_page(pmd)); -} - -/** - * stage2_dissolve_pud() - clear and flush huge PUD entry - * @kvm: pointer to kvm structure. - * @addr: IPA - * @pud: pud pointer for IPA - * - * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. 
- */ -static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp) -{ - if (!stage2_pud_huge(kvm, *pudp)) - return; - - stage2_pud_clear(kvm, pudp); - kvm_tlb_flush_vmid_ipa(kvm, addr); - put_page(virt_to_page(pudp)); -} - -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - int min, int max) -{ - void *page; - - BUG_ON(max > KVM_NR_MEM_OBJS); - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < max) { - page = (void *)__get_free_page(GFP_PGTABLE_USER); - if (!page) - return -ENOMEM; - cache->objects[cache->nobjs++] = page; - } - return 0; -} - -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); -} - -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) -{ - void *p; - - BUG_ON(!mc || !mc->nobjs); - p = mc->objects[--mc->nobjs]; - return p; -} - -static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) -{ - pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL); - stage2_pgd_clear(kvm, pgd); - kvm_tlb_flush_vmid_ipa(kvm, addr); - stage2_pud_free(kvm, pud_table); - put_page(virt_to_page(pgd)); -} - -static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) -{ - pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0); - VM_BUG_ON(stage2_pud_huge(kvm, *pud)); - stage2_pud_clear(kvm, pud); - kvm_tlb_flush_vmid_ipa(kvm, addr); - stage2_pmd_free(kvm, pmd_table); - put_page(virt_to_page(pud)); -} - -static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) -{ - pte_t *pte_table = pte_offset_kernel(pmd, 0); - VM_BUG_ON(pmd_thp_or_huge(*pmd)); - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(kvm, addr); - free_page((unsigned long)pte_table); - put_page(virt_to_page(pmd)); -} - -static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte) -{ - WRITE_ONCE(*ptep, new_pte); - dsb(ishst); -} - -static inline void kvm_set_pmd(pmd_t *pmdp, 
pmd_t new_pmd) -{ - WRITE_ONCE(*pmdp, new_pmd); - dsb(ishst); -} - -static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep) -{ - kvm_set_pmd(pmdp, kvm_mk_pmd(ptep)); -} - -static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp) -{ - WRITE_ONCE(*pudp, kvm_mk_pud(pmdp)); - dsb(ishst); -} - -static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp) -{ - WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp)); - dsb(ishst); -} - -/* - * Unmapping vs dcache management: - * - * If a guest maps certain memory pages as uncached, all writes will - * bypass the data cache and go directly to RAM. However, the CPUs - * can still speculate reads (not writes) and fill cache lines with - * data. - * - * Those cache lines will be *clean* cache lines though, so a - * clean+invalidate operation is equivalent to an invalidate - * operation, because no cache lines are marked dirty. - * - * Those clean cache lines could be filled prior to an uncached write - * by the guest, and the cache coherent IO subsystem would therefore - * end up writing old data to disk. - * - * This is why right after unmapping a page/section and invalidating - * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure - * the IO subsystem will never hit in the cache. - * - * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as - * we then fully enforce cacheability of RAM, no matter what the guest - * does. 
- */ -static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd, - phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t start_addr = addr; - pte_t *pte, *start_pte; - - start_pte = pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte)) { - pte_t old_pte = *pte; - - kvm_set_pte(pte, __pte(0)); - kvm_tlb_flush_vmid_ipa(kvm, addr); - - /* No need to invalidate the cache for device mappings */ - if (!kvm_is_device_pfn(pte_pfn(old_pte))) - kvm_flush_dcache_pte(old_pte); - - put_page(virt_to_page(pte)); - } - } while (pte++, addr += PAGE_SIZE, addr != end); - - if (stage2_pte_table_empty(kvm, start_pte)) - clear_stage2_pmd_entry(kvm, pmd, start_addr); -} - -static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, - phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t next, start_addr = addr; - pmd_t *pmd, *start_pmd; - - start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr); - do { - next = stage2_pmd_addr_end(kvm, addr, end); - if (!pmd_none(*pmd)) { - if (pmd_thp_or_huge(*pmd)) { - pmd_t old_pmd = *pmd; - - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(kvm, addr); - - kvm_flush_dcache_pmd(old_pmd); - - put_page(virt_to_page(pmd)); - } else { - unmap_stage2_ptes(kvm, pmd, addr, next); - } - } - } while (pmd++, addr = next, addr != end); - - if (stage2_pmd_table_empty(kvm, start_pmd)) - clear_stage2_pud_entry(kvm, pud, start_addr); -} - -static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, - phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t next, start_addr = addr; - pud_t *pud, *start_pud; - - start_pud = pud = stage2_pud_offset(kvm, pgd, addr); - do { - next = stage2_pud_addr_end(kvm, addr, end); - if (!stage2_pud_none(kvm, *pud)) { - if (stage2_pud_huge(kvm, *pud)) { - pud_t old_pud = *pud; - - stage2_pud_clear(kvm, pud); - kvm_tlb_flush_vmid_ipa(kvm, addr); - kvm_flush_dcache_pud(old_pud); - put_page(virt_to_page(pud)); - } else { - unmap_stage2_pmds(kvm, pud, addr, next); - } - } - } while (pud++, addr = next, addr != end); - - if 
(stage2_pud_table_empty(kvm, start_pud)) - clear_stage2_pgd_entry(kvm, pgd, start_addr); -} - -/** - * unmap_stage2_range -- Clear stage2 page table entries to unmap a range - * @kvm: The VM pointer - * @start: The intermediate physical base address of the range to unmap - * @size: The size of the area to unmap - * - * Clear a range of stage-2 mappings, lowering the various ref-counts. Must - * be called while holding mmu_lock (unless for freeing the stage2 pgd before - * destroying the VM), otherwise another faulting VCPU may come in and mess - * with things behind our backs. - */ -static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) -{ - pgd_t *pgd; - phys_addr_t addr = start, end = start + size; - phys_addr_t next; - - assert_spin_locked(&kvm->mmu_lock); - WARN_ON(size & ~PAGE_MASK); - - pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); - do { - /* - * Make sure the page table is still active, as another thread - * could have possibly freed the page table, while we released - * the lock. - */ - if (!READ_ONCE(kvm->arch.pgd)) - break; - next = stage2_pgd_addr_end(kvm, addr, end); - if (!stage2_pgd_none(kvm, *pgd)) - unmap_stage2_puds(kvm, pgd, addr, next); - /* - * If the range is too large, release the kvm->mmu_lock - * to prevent starvation and lockup detector warnings. 
- */ - if (next != end) - cond_resched_lock(&kvm->mmu_lock); - } while (pgd++, addr = next, addr != end); -} - -static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd, - phys_addr_t addr, phys_addr_t end) -{ - pte_t *pte; - - pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte))) - kvm_flush_dcache_pte(*pte); - } while (pte++, addr += PAGE_SIZE, addr != end); -} - -static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, - phys_addr_t addr, phys_addr_t end) -{ - pmd_t *pmd; - phys_addr_t next; - - pmd = stage2_pmd_offset(kvm, pud, addr); - do { - next = stage2_pmd_addr_end(kvm, addr, end); - if (!pmd_none(*pmd)) { - if (pmd_thp_or_huge(*pmd)) - kvm_flush_dcache_pmd(*pmd); - else - stage2_flush_ptes(kvm, pmd, addr, next); - } - } while (pmd++, addr = next, addr != end); -} - -static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, - phys_addr_t addr, phys_addr_t end) -{ - pud_t *pud; - phys_addr_t next; - - pud = stage2_pud_offset(kvm, pgd, addr); - do { - next = stage2_pud_addr_end(kvm, addr, end); - if (!stage2_pud_none(kvm, *pud)) { - if (stage2_pud_huge(kvm, *pud)) - kvm_flush_dcache_pud(*pud); - else - stage2_flush_pmds(kvm, pud, addr, next); - } - } while (pud++, addr = next, addr != end); -} - -static void stage2_flush_memslot(struct kvm *kvm, - struct kvm_memory_slot *memslot) -{ - phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; - phys_addr_t end = addr + PAGE_SIZE * memslot->npages; - phys_addr_t next; - pgd_t *pgd; - - pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); - do { - next = stage2_pgd_addr_end(kvm, addr, end); - if (!stage2_pgd_none(kvm, *pgd)) - stage2_flush_puds(kvm, pgd, addr, next); - } while (pgd++, addr = next, addr != end); -} - -/** - * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 - * @kvm: The struct kvm pointer - * - * Go through the stage 2 page tables and invalidate any cache lines - * backing memory already mapped to the VM. 
- */ -static void stage2_flush_vm(struct kvm *kvm) -{ - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - int idx; - - idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); - - slots = kvm_memslots(kvm); - kvm_for_each_memslot(memslot, slots) - stage2_flush_memslot(kvm, memslot); - - spin_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); -} - -static void clear_hyp_pgd_entry(pgd_t *pgd) -{ - pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL); - pgd_clear(pgd); - pud_free(NULL, pud_table); - put_page(virt_to_page(pgd)); -} - -static void clear_hyp_pud_entry(pud_t *pud) -{ - pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0); - VM_BUG_ON(pud_huge(*pud)); - pud_clear(pud); - pmd_free(NULL, pmd_table); - put_page(virt_to_page(pud)); -} - -static void clear_hyp_pmd_entry(pmd_t *pmd) -{ - pte_t *pte_table = pte_offset_kernel(pmd, 0); - VM_BUG_ON(pmd_thp_or_huge(*pmd)); - pmd_clear(pmd); - pte_free_kernel(NULL, pte_table); - put_page(virt_to_page(pmd)); -} - -static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) -{ - pte_t *pte, *start_pte; - - start_pte = pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte)) { - kvm_set_pte(pte, __pte(0)); - put_page(virt_to_page(pte)); - } - } while (pte++, addr += PAGE_SIZE, addr != end); - - if (hyp_pte_table_empty(start_pte)) - clear_hyp_pmd_entry(pmd); -} - -static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t next; - pmd_t *pmd, *start_pmd; - - start_pmd = pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - /* Hyp doesn't use huge pmds */ - if (!pmd_none(*pmd)) - unmap_hyp_ptes(pmd, addr, next); - } while (pmd++, addr = next, addr != end); - - if (hyp_pmd_table_empty(start_pmd)) - clear_hyp_pud_entry(pud); -} - -static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t next; - pud_t *pud, *start_pud; - - start_pud = pud = pud_offset(pgd, addr); - do { - next = 
pud_addr_end(addr, end); - /* Hyp doesn't use huge puds */ - if (!pud_none(*pud)) - unmap_hyp_pmds(pud, addr, next); - } while (pud++, addr = next, addr != end); - - if (hyp_pud_table_empty(start_pud)) - clear_hyp_pgd_entry(pgd); -} - -static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd) -{ - return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1); -} - -static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd, - phys_addr_t start, u64 size) -{ - pgd_t *pgd; - phys_addr_t addr = start, end = start + size; - phys_addr_t next; - - /* - * We don't unmap anything from HYP, except at the hyp tear down. - * Hence, we don't have to invalidate the TLBs here. - */ - pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); - do { - next = pgd_addr_end(addr, end); - if (!pgd_none(*pgd)) - unmap_hyp_puds(pgd, addr, next); - } while (pgd++, addr = next, addr != end); -} - -static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) -{ - __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size); -} - -static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size) -{ - __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size); -} - -/** - * free_hyp_pgds - free Hyp-mode page tables - * - * Assumes hyp_pgd is a page table used strictly in Hyp-mode and - * therefore contains either mappings in the kernel memory area (above - * PAGE_OFFSET), or device mappings in the idmap range. - * - * boot_hyp_pgd should only map the idmap range, and is only used in - * the extended idmap case. - */ -void free_hyp_pgds(void) -{ - pgd_t *id_pgd; - - mutex_lock(&kvm_hyp_pgd_mutex); - - id_pgd = boot_hyp_pgd ? 
boot_hyp_pgd : hyp_pgd; - - if (id_pgd) { - /* In case we never called hyp_mmu_init() */ - if (!io_map_base) - io_map_base = hyp_idmap_start; - unmap_hyp_idmap_range(id_pgd, io_map_base, - hyp_idmap_start + PAGE_SIZE - io_map_base); - } - - if (boot_hyp_pgd) { - free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); - boot_hyp_pgd = NULL; - } - - if (hyp_pgd) { - unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET), - (uintptr_t)high_memory - PAGE_OFFSET); - - free_pages((unsigned long)hyp_pgd, hyp_pgd_order); - hyp_pgd = NULL; - } - if (merged_hyp_pgd) { - clear_page(merged_hyp_pgd); - free_page((unsigned long)merged_hyp_pgd); - merged_hyp_pgd = NULL; - } - - mutex_unlock(&kvm_hyp_pgd_mutex); -} - -static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, - unsigned long end, unsigned long pfn, - pgprot_t prot) -{ - pte_t *pte; - unsigned long addr; - - addr = start; - do { - pte = pte_offset_kernel(pmd, addr); - kvm_set_pte(pte, kvm_pfn_pte(pfn, prot)); - get_page(virt_to_page(pte)); - pfn++; - } while (addr += PAGE_SIZE, addr != end); -} - -static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, - unsigned long end, unsigned long pfn, - pgprot_t prot) -{ - pmd_t *pmd; - pte_t *pte; - unsigned long addr, next; - - addr = start; - do { - pmd = pmd_offset(pud, addr); - - BUG_ON(pmd_sect(*pmd)); - - if (pmd_none(*pmd)) { - pte = pte_alloc_one_kernel(NULL); - if (!pte) { - kvm_err("Cannot allocate Hyp pte\n"); - return -ENOMEM; - } - kvm_pmd_populate(pmd, pte); - get_page(virt_to_page(pmd)); - } - - next = pmd_addr_end(addr, end); - - create_hyp_pte_mappings(pmd, addr, next, pfn, prot); - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); - - return 0; -} - -static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, - unsigned long end, unsigned long pfn, - pgprot_t prot) -{ - pud_t *pud; - pmd_t *pmd; - unsigned long addr, next; - int ret; - - addr = start; - do { - pud = pud_offset(pgd, addr); - - if 
(pud_none_or_clear_bad(pud)) { - pmd = pmd_alloc_one(NULL, addr); - if (!pmd) { - kvm_err("Cannot allocate Hyp pmd\n"); - return -ENOMEM; - } - kvm_pud_populate(pud, pmd); - get_page(virt_to_page(pud)); - } - - next = pud_addr_end(addr, end); - ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot); - if (ret) - return ret; - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); - - return 0; -} - -static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, - unsigned long start, unsigned long end, - unsigned long pfn, pgprot_t prot) -{ - pgd_t *pgd; - pud_t *pud; - unsigned long addr, next; - int err = 0; - - mutex_lock(&kvm_hyp_pgd_mutex); - addr = start & PAGE_MASK; - end = PAGE_ALIGN(end); - do { - pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); - - if (pgd_none(*pgd)) { - pud = pud_alloc_one(NULL, addr); - if (!pud) { - kvm_err("Cannot allocate Hyp pud\n"); - err = -ENOMEM; - goto out; - } - kvm_pgd_populate(pgd, pud); - get_page(virt_to_page(pgd)); - } - - next = pgd_addr_end(addr, end); - err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot); - if (err) - goto out; - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); -out: - mutex_unlock(&kvm_hyp_pgd_mutex); - return err; -} - -static phys_addr_t kvm_kaddr_to_phys(void *kaddr) -{ - if (!is_vmalloc_addr(kaddr)) { - BUG_ON(!virt_addr_valid(kaddr)); - return __pa(kaddr); - } else { - return page_to_phys(vmalloc_to_page(kaddr)) + - offset_in_page(kaddr); - } -} - -/** - * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode - * @from: The virtual kernel start address of the range - * @to: The virtual kernel end address of the range (exclusive) - * @prot: The protection to be applied to this range - * - * The same virtual address as the kernel virtual address is also used - * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying - * physical pages. 
- */ -int create_hyp_mappings(void *from, void *to, pgprot_t prot) -{ - phys_addr_t phys_addr; - unsigned long virt_addr; - unsigned long start = kern_hyp_va((unsigned long)from); - unsigned long end = kern_hyp_va((unsigned long)to); - - if (is_kernel_in_hyp_mode()) - return 0; - - start = start & PAGE_MASK; - end = PAGE_ALIGN(end); - - for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { - int err; - - phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); - err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD, - virt_addr, virt_addr + PAGE_SIZE, - __phys_to_pfn(phys_addr), - prot); - if (err) - return err; - } - - return 0; -} - -static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, - unsigned long *haddr, pgprot_t prot) -{ - pgd_t *pgd = hyp_pgd; - unsigned long base; - int ret = 0; - - mutex_lock(&kvm_hyp_pgd_mutex); - - /* - * This assumes that we we have enough space below the idmap - * page to allocate our VAs. If not, the check below will - * kick. A potential alternative would be to detect that - * overflow and switch to an allocation above the idmap. - * - * The allocated size is always a multiple of PAGE_SIZE. - */ - size = PAGE_ALIGN(size + offset_in_page(phys_addr)); - base = io_map_base - size; - - /* - * Verify that BIT(VA_BITS - 1) hasn't been flipped by - * allocating the new area, as it would indicate we've - * overflowed the idmap/IO address range. 
- */ - if ((base ^ io_map_base) & BIT(VA_BITS - 1)) - ret = -ENOMEM; - else - io_map_base = base; - - mutex_unlock(&kvm_hyp_pgd_mutex); - - if (ret) - goto out; - - if (__kvm_cpu_uses_extended_idmap()) - pgd = boot_hyp_pgd; - - ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), - base, base + size, - __phys_to_pfn(phys_addr), prot); - if (ret) - goto out; - - *haddr = base + offset_in_page(phys_addr); - -out: - return ret; -} - -/** - * create_hyp_io_mappings - Map IO into both kernel and HYP - * @phys_addr: The physical start address which gets mapped - * @size: Size of the region being mapped - * @kaddr: Kernel VA for this mapping - * @haddr: HYP VA for this mapping - */ -int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, - void __iomem **kaddr, - void __iomem **haddr) -{ - unsigned long addr; - int ret; - - *kaddr = ioremap(phys_addr, size); - if (!*kaddr) - return -ENOMEM; - - if (is_kernel_in_hyp_mode()) { - *haddr = *kaddr; - return 0; - } - - ret = __create_hyp_private_mapping(phys_addr, size, - &addr, PAGE_HYP_DEVICE); - if (ret) { - iounmap(*kaddr); - *kaddr = NULL; - *haddr = NULL; - return ret; - } - - *haddr = (void __iomem *)addr; - return 0; -} - -/** - * create_hyp_exec_mappings - Map an executable range into HYP - * @phys_addr: The physical start address which gets mapped - * @size: Size of the region being mapped - * @haddr: HYP VA for this mapping - */ -int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, - void **haddr) -{ - unsigned long addr; - int ret; - - BUG_ON(is_kernel_in_hyp_mode()); - - ret = __create_hyp_private_mapping(phys_addr, size, - &addr, PAGE_HYP_EXEC); - if (ret) { - *haddr = NULL; - return ret; - } - - *haddr = (void *)addr; - return 0; -} - -/** - * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. - * @kvm: The KVM struct pointer for the VM. - * - * Allocates only the stage-2 HW PGD level table(s) of size defined by - * stage2_pgd_size(kvm). 
- * - * Note we don't need locking here as this is only called when the VM is - * created, which can only be done once. - */ -int kvm_alloc_stage2_pgd(struct kvm *kvm) -{ - phys_addr_t pgd_phys; - pgd_t *pgd; - - if (kvm->arch.pgd != NULL) { - kvm_err("kvm_arch already initialized?\n"); - return -EINVAL; - } - - /* Allocate the HW PGD, making sure that each page gets its own refcount */ - pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO); - if (!pgd) - return -ENOMEM; - - pgd_phys = virt_to_phys(pgd); - if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm))) - return -EINVAL; - - kvm->arch.pgd = pgd; - kvm->arch.pgd_phys = pgd_phys; - return 0; -} - -static void stage2_unmap_memslot(struct kvm *kvm, - struct kvm_memory_slot *memslot) -{ - hva_t hva = memslot->userspace_addr; - phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; - phys_addr_t size = PAGE_SIZE * memslot->npages; - hva_t reg_end = hva + size; - - /* - * A memory region could potentially cover multiple VMAs, and any holes - * between them, so iterate over all of them to find out if we should - * unmap any of them. 
- * - * +--------------------------------------------+ - * +---------------+----------------+ +----------------+ - * | : VMA 1 | VMA 2 | | VMA 3 : | - * +---------------+----------------+ +----------------+ - * | memory region | - * +--------------------------------------------+ - */ - do { - struct vm_area_struct *vma = find_vma(current->mm, hva); - hva_t vm_start, vm_end; - - if (!vma || vma->vm_start >= reg_end) - break; - - /* - * Take the intersection of this VMA with the memory region - */ - vm_start = max(hva, vma->vm_start); - vm_end = min(reg_end, vma->vm_end); - - if (!(vma->vm_flags & VM_PFNMAP)) { - gpa_t gpa = addr + (vm_start - memslot->userspace_addr); - unmap_stage2_range(kvm, gpa, vm_end - vm_start); - } - hva = vm_end; - } while (hva < reg_end); -} - -/** - * stage2_unmap_vm - Unmap Stage-2 RAM mappings - * @kvm: The struct kvm pointer - * - * Go through the memregions and unmap any reguler RAM - * backing memory already mapped to the VM. - */ -void stage2_unmap_vm(struct kvm *kvm) -{ - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - int idx; - - idx = srcu_read_lock(&kvm->srcu); - down_read(&current->mm->mmap_sem); - spin_lock(&kvm->mmu_lock); - - slots = kvm_memslots(kvm); - kvm_for_each_memslot(memslot, slots) - stage2_unmap_memslot(kvm, memslot); - - spin_unlock(&kvm->mmu_lock); - up_read(&current->mm->mmap_sem); - srcu_read_unlock(&kvm->srcu, idx); -} - -/** - * kvm_free_stage2_pgd - free all stage-2 tables - * @kvm: The KVM struct pointer for the VM. - * - * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all - * underlying level-2 and level-3 tables before freeing the actual level-1 table - * and setting the struct pointer to NULL.
- */ -void kvm_free_stage2_pgd(struct kvm *kvm) -{ - void *pgd = NULL; - - spin_lock(&kvm->mmu_lock); - if (kvm->arch.pgd) { - unmap_stage2_range(kvm, 0, kvm_phys_size(kvm)); - pgd = READ_ONCE(kvm->arch.pgd); - kvm->arch.pgd = NULL; - kvm->arch.pgd_phys = 0; - } - spin_unlock(&kvm->mmu_lock); - - /* Free the HW pgd, one page at a time */ - if (pgd) - free_pages_exact(pgd, stage2_pgd_size(kvm)); -} - -static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr) -{ - pgd_t *pgd; - pud_t *pud; - - pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); - if (stage2_pgd_none(kvm, *pgd)) { - if (!cache) - return NULL; - pud = mmu_memory_cache_alloc(cache); - stage2_pgd_populate(kvm, pgd, pud); - get_page(virt_to_page(pgd)); - } - - return stage2_pud_offset(kvm, pgd, addr); -} - -static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr) -{ - pud_t *pud; - pmd_t *pmd; - - pud = stage2_get_pud(kvm, cache, addr); - if (!pud || stage2_pud_huge(kvm, *pud)) - return NULL; - - if (stage2_pud_none(kvm, *pud)) { - if (!cache) - return NULL; - pmd = mmu_memory_cache_alloc(cache); - stage2_pud_populate(kvm, pud, pmd); - get_page(virt_to_page(pud)); - } - - return stage2_pmd_offset(kvm, pud, addr); -} - -static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache - *cache, phys_addr_t addr, const pmd_t *new_pmd) -{ - pmd_t *pmd, old_pmd; - -retry: - pmd = stage2_get_pmd(kvm, cache, addr); - VM_BUG_ON(!pmd); - - old_pmd = *pmd; - /* - * Multiple vcpus faulting on the same PMD entry, can - * lead to them sequentially updating the PMD with the - * same value. Following the break-before-make - * (pmd_clear() followed by tlb_flush()) process can - * hinder forward progress due to refaults generated - * on missing translations. - * - * Skip updating the page table if the entry is - * unchanged. 
- */ - if (pmd_val(old_pmd) == pmd_val(*new_pmd)) - return 0; - - if (pmd_present(old_pmd)) { - /* - * If we already have PTE level mapping for this block, - * we must unmap it to avoid inconsistent TLB state and - * leaking the table page. We could end up in this situation - * if the memory slot was marked for dirty logging and was - * reverted, leaving PTE level mappings for the pages accessed - * during the period. So, unmap the PTE level mapping for this - * block and retry, as we could have released the upper level - * table in the process. - * - * Normal THP split/merge follows mmu_notifier callbacks and do - * get handled accordingly. - */ - if (!pmd_thp_or_huge(old_pmd)) { - unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE); - goto retry; - } - /* - * Mapping in huge pages should only happen through a - * fault. If a page is merged into a transparent huge - * page, the individual subpages of that huge page - * should be unmapped through MMU notifiers before we - * get here. - * - * Merging of CompoundPages is not supported; they - * should become splitting first, unmapped, merged, - * and mapped back in on-demand. - */ - WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(kvm, addr); - } else { - get_page(virt_to_page(pmd)); - } - - kvm_set_pmd(pmd, *new_pmd); - return 0; -} - -static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pud_t *new_pudp) -{ - pud_t *pudp, old_pud; - -retry: - pudp = stage2_get_pud(kvm, cache, addr); - VM_BUG_ON(!pudp); - - old_pud = *pudp; - - /* - * A large number of vcpus faulting on the same stage 2 entry, - * can lead to a refault due to the stage2_pud_clear()/tlb_flush(). - * Skip updating the page tables if there is no change. 
- */ - if (pud_val(old_pud) == pud_val(*new_pudp)) - return 0; - - if (stage2_pud_present(kvm, old_pud)) { - /* - * If we already have table level mapping for this block, unmap - * the range for this block and retry. - */ - if (!stage2_pud_huge(kvm, old_pud)) { - unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE); - goto retry; - } - - WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp)); - stage2_pud_clear(kvm, pudp); - kvm_tlb_flush_vmid_ipa(kvm, addr); - } else { - get_page(virt_to_page(pudp)); - } - - kvm_set_pud(pudp, *new_pudp); - return 0; -} - -/* - * stage2_get_leaf_entry - walk the stage2 VM page tables and return - * true if a valid and present leaf-entry is found. A pointer to the - * leaf-entry is returned in the appropriate level variable - pudpp, - * pmdpp, ptepp. - */ -static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr, - pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp) -{ - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - *pudpp = NULL; - *pmdpp = NULL; - *ptepp = NULL; - - pudp = stage2_get_pud(kvm, NULL, addr); - if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp)) - return false; - - if (stage2_pud_huge(kvm, *pudp)) { - *pudpp = pudp; - return true; - } - - pmdp = stage2_pmd_offset(kvm, pudp, addr); - if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) - return false; - - if (pmd_thp_or_huge(*pmdp)) { - *pmdpp = pmdp; - return true; - } - - ptep = pte_offset_kernel(pmdp, addr); - if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) - return false; - - *ptepp = ptep; - return true; -} - -static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) -{ - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - bool found; - - found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep); - if (!found) - return false; - - if (pudp) - return kvm_s2pud_exec(pudp); - else if (pmdp) - return kvm_s2pmd_exec(pmdp); - else - return kvm_s2pte_exec(ptep); -} - -static int stage2_set_pte(struct kvm *kvm, struct 
kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, - unsigned long flags) -{ - pud_t *pud; - pmd_t *pmd; - pte_t *pte, old_pte; - bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; - bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE; - - VM_BUG_ON(logging_active && !cache); - - /* Create stage-2 page table mapping - Levels 0 and 1 */ - pud = stage2_get_pud(kvm, cache, addr); - if (!pud) { - /* - * Ignore calls from kvm_set_spte_hva for unallocated - * address ranges. - */ - return 0; - } - - /* - * While dirty page logging - dissolve huge PUD, then continue - * on to allocate page. - */ - if (logging_active) - stage2_dissolve_pud(kvm, addr, pud); - - if (stage2_pud_none(kvm, *pud)) { - if (!cache) - return 0; /* ignore calls from kvm_set_spte_hva */ - pmd = mmu_memory_cache_alloc(cache); - stage2_pud_populate(kvm, pud, pmd); - get_page(virt_to_page(pud)); - } - - pmd = stage2_pmd_offset(kvm, pud, addr); - if (!pmd) { - /* - * Ignore calls from kvm_set_spte_hva for unallocated - * address ranges. - */ - return 0; - } - - /* - * While dirty page logging - dissolve huge PMD, then continue on to - * allocate page. 
- */ - if (logging_active) - stage2_dissolve_pmd(kvm, addr, pmd); - - /* Create stage-2 page mappings - Level 2 */ - if (pmd_none(*pmd)) { - if (!cache) - return 0; /* ignore calls from kvm_set_spte_hva */ - pte = mmu_memory_cache_alloc(cache); - kvm_pmd_populate(pmd, pte); - get_page(virt_to_page(pmd)); - } - - pte = pte_offset_kernel(pmd, addr); - - if (iomap && pte_present(*pte)) - return -EFAULT; - - /* Create 2nd stage page table mapping - Level 3 */ - old_pte = *pte; - if (pte_present(old_pte)) { - /* Skip page table update if there is no change */ - if (pte_val(old_pte) == pte_val(*new_pte)) - return 0; - - kvm_set_pte(pte, __pte(0)); - kvm_tlb_flush_vmid_ipa(kvm, addr); - } else { - get_page(virt_to_page(pte)); - } - - kvm_set_pte(pte, *new_pte); - return 0; -} - -#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static int stage2_ptep_test_and_clear_young(pte_t *pte) -{ - if (pte_young(*pte)) { - *pte = pte_mkold(*pte); - return 1; - } - return 0; -} -#else -static int stage2_ptep_test_and_clear_young(pte_t *pte) -{ - return __ptep_test_and_clear_young(pte); -} -#endif - -static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) -{ - return stage2_ptep_test_and_clear_young((pte_t *)pmd); -} - -static int stage2_pudp_test_and_clear_young(pud_t *pud) -{ - return stage2_ptep_test_and_clear_young((pte_t *)pud); -} - -/** - * kvm_phys_addr_ioremap - map a device range to guest IPA - * - * @kvm: The KVM pointer - * @guest_ipa: The IPA at which to insert the mapping - * @pa: The physical address of the device - * @size: The size of the mapping - */ -int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, - phys_addr_t pa, unsigned long size, bool writable) -{ - phys_addr_t addr, end; - int ret = 0; - unsigned long pfn; - struct kvm_mmu_memory_cache cache = { 0, }; - - end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK; - pfn = __phys_to_pfn(pa); - - for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { - pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE); - - 
if (writable) - pte = kvm_s2pte_mkwrite(pte); - - ret = mmu_topup_memory_cache(&cache, - kvm_mmu_cache_min_pages(kvm), - KVM_NR_MEM_OBJS); - if (ret) - goto out; - spin_lock(&kvm->mmu_lock); - ret = stage2_set_pte(kvm, &cache, addr, &pte, - KVM_S2PTE_FLAG_IS_IOMAP); - spin_unlock(&kvm->mmu_lock); - if (ret) - goto out; - - pfn++; - } - -out: - mmu_free_memory_cache(&cache); - return ret; -} - -static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) -{ - kvm_pfn_t pfn = *pfnp; - gfn_t gfn = *ipap >> PAGE_SHIFT; - - if (kvm_is_transparent_hugepage(pfn)) { - unsigned long mask; - /* - * The address we faulted on is backed by a transparent huge - * page. However, because we map the compound huge page and - * not the individual tail page, we need to transfer the - * refcount to the head page. We have to be careful that the - * THP doesn't start to split while we are adjusting the - * refcounts. - * - * We are sure this doesn't happen, because mmu_notifier_retry - * was successful and we are holding the mmu_lock, so if this - * THP is trying to split, it will be blocked in the mmu - * notifier before touching any of the pages, specifically - * before being able to call __split_huge_page_refcount(). - * - * We can therefore safely transfer the refcount from PG_tail - * to PG_head and switch the pfn from a tail page to the head - * page accordingly. 
- */ - mask = PTRS_PER_PMD - 1; - VM_BUG_ON((gfn & mask) != (pfn & mask)); - if (pfn & mask) { - *ipap &= PMD_MASK; - kvm_release_pfn_clean(pfn); - pfn &= ~mask; - kvm_get_pfn(pfn); - *pfnp = pfn; - } - - return true; - } - - return false; -} - -/** - * stage2_wp_ptes - write protect PMD range - * @pmd: pointer to pmd entry - * @addr: range start address - * @end: range end address - */ -static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) -{ - pte_t *pte; - - pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte)) { - if (!kvm_s2pte_readonly(pte)) - kvm_set_s2pte_readonly(pte); - } - } while (pte++, addr += PAGE_SIZE, addr != end); -} - -/** - * stage2_wp_pmds - write protect PUD range - * kvm: kvm instance for the VM - * @pud: pointer to pud entry - * @addr: range start address - * @end: range end address - */ -static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, - phys_addr_t addr, phys_addr_t end) -{ - pmd_t *pmd; - phys_addr_t next; - - pmd = stage2_pmd_offset(kvm, pud, addr); - - do { - next = stage2_pmd_addr_end(kvm, addr, end); - if (!pmd_none(*pmd)) { - if (pmd_thp_or_huge(*pmd)) { - if (!kvm_s2pmd_readonly(pmd)) - kvm_set_s2pmd_readonly(pmd); - } else { - stage2_wp_ptes(pmd, addr, next); - } - } - } while (pmd++, addr = next, addr != end); -} - -/** - * stage2_wp_puds - write protect PGD range - * @pgd: pointer to pgd entry - * @addr: range start address - * @end: range end address - */ -static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, - phys_addr_t addr, phys_addr_t end) -{ - pud_t *pud; - phys_addr_t next; - - pud = stage2_pud_offset(kvm, pgd, addr); - do { - next = stage2_pud_addr_end(kvm, addr, end); - if (!stage2_pud_none(kvm, *pud)) { - if (stage2_pud_huge(kvm, *pud)) { - if (!kvm_s2pud_readonly(pud)) - kvm_set_s2pud_readonly(pud); - } else { - stage2_wp_pmds(kvm, pud, addr, next); - } - } - } while (pud++, addr = next, addr != end); -} - -/** - * stage2_wp_range() - write protect stage2 memory region range - 
* @kvm: The KVM pointer - * @addr: Start address of range - * @end: End address of range - */ -static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) -{ - pgd_t *pgd; - phys_addr_t next; - - pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); - do { - /* - * Release kvm_mmu_lock periodically if the memory region is - * large. Otherwise, we may see kernel panics with - * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, - * CONFIG_LOCKDEP. Additionally, holding the lock too long - * will also starve other vCPUs. We have to also make sure - * that the page tables are not freed while we released - * the lock. - */ - cond_resched_lock(&kvm->mmu_lock); - if (!READ_ONCE(kvm->arch.pgd)) - break; - next = stage2_pgd_addr_end(kvm, addr, end); - if (stage2_pgd_present(kvm, *pgd)) - stage2_wp_puds(kvm, pgd, addr, next); - } while (pgd++, addr = next, addr != end); -} - -/** - * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot - * @kvm: The KVM pointer - * @slot: The memory slot to write protect - * - * Called to start logging dirty pages after memory region - * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns - * all present PUD, PMD and PTEs are write protected in the memory region. - * Afterwards read of dirty page log can be called. - * - * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, - * serializing operations for VM memory regions. 
- */ -void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) -{ - struct kvm_memslots *slots = kvm_memslots(kvm); - struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); - phys_addr_t start, end; - - if (WARN_ON_ONCE(!memslot)) - return; - - start = memslot->base_gfn << PAGE_SHIFT; - end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; - - spin_lock(&kvm->mmu_lock); - stage2_wp_range(kvm, start, end); - spin_unlock(&kvm->mmu_lock); - kvm_flush_remote_tlbs(kvm); -} - -/** - * kvm_mmu_write_protect_pt_masked() - write protect dirty pages - * @kvm: The KVM pointer - * @slot: The memory slot associated with mask - * @gfn_offset: The gfn offset in memory slot - * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory - * slot to be write protected - * - * Walks bits set in mask write protects the associated pte's. Caller must - * acquire kvm_mmu_lock. - */ -static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask) -{ - phys_addr_t base_gfn = slot->base_gfn + gfn_offset; - phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; - phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; - - stage2_wp_range(kvm, start, end); -} - -/* - * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected - * dirty pages. - * - * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to - * enable dirty logging for them. 
- */ -void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask) -{ - kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); -} - -static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) -{ - __clean_dcache_guest_page(pfn, size); -} - -static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size) -{ - __invalidate_icache_guest_page(pfn, size); -} - -static void kvm_send_hwpoison_signal(unsigned long address, short lsb) -{ - send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); -} - -static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, - unsigned long hva, - unsigned long map_size) -{ - gpa_t gpa_start; - hva_t uaddr_start, uaddr_end; - size_t size; - - size = memslot->npages * PAGE_SIZE; - - gpa_start = memslot->base_gfn << PAGE_SHIFT; - - uaddr_start = memslot->userspace_addr; - uaddr_end = uaddr_start + size; - - /* - * Pages belonging to memslots that don't have the same alignment - * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 - * PMD/PUD entries, because we'll end up mapping the wrong pages. - * - * Consider a layout like the following: - * - * memslot->userspace_addr: - * +-----+--------------------+--------------------+---+ - * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| - * +-----+--------------------+--------------------+---+ - * - * memslot->base_gfn << PAGE_SIZE: - * +---+--------------------+--------------------+-----+ - * |abc|def Stage-2 block | Stage-2 block |tvxyz| - * +---+--------------------+--------------------+-----+ - * - * If we create those stage-2 blocks, we'll end up with this incorrect - * mapping: - * d -> f - * e -> g - * f -> h - */ - if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) - return false; - - /* - * Next, let's make sure we're not trying to map anything not covered - * by the memslot. 
This means we have to prohibit block size mappings - * for the beginning and end of a non-block aligned and non-block sized - * memory slot (illustrated by the head and tail parts of the - * userspace view above containing pages 'abcde' and 'xyz', - * respectively). - * - * Note that it doesn't matter if we do the check using the - * userspace_addr or the base_gfn, as both are equally aligned (per - * the check above) and equally sized. - */ - return (hva & ~(map_size - 1)) >= uaddr_start && - (hva & ~(map_size - 1)) + map_size <= uaddr_end; -} - -static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, - struct kvm_memory_slot *memslot, unsigned long hva, - unsigned long fault_status) -{ - int ret; - bool write_fault, writable, force_pte = false; - bool exec_fault, needs_exec; - unsigned long mmu_seq; - gfn_t gfn = fault_ipa >> PAGE_SHIFT; - struct kvm *kvm = vcpu->kvm; - struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; - struct vm_area_struct *vma; - short vma_shift; - kvm_pfn_t pfn; - pgprot_t mem_type = PAGE_S2; - bool logging_active = memslot_is_logging(memslot); - unsigned long vma_pagesize, flags = 0; - - write_fault = kvm_is_write_fault(vcpu); - exec_fault = kvm_vcpu_trap_is_iabt(vcpu); - VM_BUG_ON(write_fault && exec_fault); - - if (fault_status == FSC_PERM && !write_fault && !exec_fault) { - kvm_err("Unexpected L2 read permission error\n"); - return -EFAULT; - } - - /* Let's check if we will get back a huge page backed by hugetlbfs */ - down_read(&current->mm->mmap_sem); - vma = find_vma_intersection(current->mm, hva, hva + 1); - if (unlikely(!vma)) { - kvm_err("Failed to find VMA for hva 0x%lx\n", hva); - up_read(&current->mm->mmap_sem); - return -EFAULT; - } - - if (is_vm_hugetlb_page(vma)) - vma_shift = huge_page_shift(hstate_vma(vma)); - else - vma_shift = PAGE_SHIFT; - - vma_pagesize = 1ULL << vma_shift; - if (logging_active || - (vma->vm_flags & VM_PFNMAP) || - !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
- force_pte = true; - vma_pagesize = PAGE_SIZE; - } - - /* - * The stage2 has a minimum of 2 level table (For arm64 see - * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can - * use PMD_SIZE huge mappings (even when the PMD is folded into PGD). - * As for PUD huge maps, we must make sure that we have at least - * 3 levels, i.e, PMD is not folded. - */ - if (vma_pagesize == PMD_SIZE || - (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) - gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; - up_read(&current->mm->mmap_sem); - - /* We need minimum second+third level pages */ - ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm), - KVM_NR_MEM_OBJS); - if (ret) - return ret; - - mmu_seq = vcpu->kvm->mmu_notifier_seq; - /* - * Ensure the read of mmu_notifier_seq happens before we call - * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk - * the page we just got a reference to gets unmapped before we have a - * chance to grab the mmu_lock, which ensure that if the page gets - * unmapped afterwards, the call to kvm_unmap_hva will take it away - * from us again properly. This smp_rmb() interacts with the smp_wmb() - * in kvm_mmu_notifier_invalidate_<page|range_end>. - */ - smp_rmb(); - - pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); - if (pfn == KVM_PFN_ERR_HWPOISON) { - kvm_send_hwpoison_signal(hva, vma_shift); - return 0; - } - if (is_error_noslot_pfn(pfn)) - return -EFAULT; - - if (kvm_is_device_pfn(pfn)) { - mem_type = PAGE_S2_DEVICE; - flags |= KVM_S2PTE_FLAG_IS_IOMAP; - } else if (logging_active) { - /* - * Faults on pages in a memslot with logging enabled - * should not be mapped with huge pages (it introduces churn - * and performance degradation), so force a pte mapping. - */ - flags |= KVM_S2_FLAG_LOGGING_ACTIVE; - - /* - * Only actually map the page as writable if this was a write - * fault.
- */ - if (!write_fault) - writable = false; - } - - if (exec_fault && is_iomap(flags)) - return -ENOEXEC; - - spin_lock(&kvm->mmu_lock); - if (mmu_notifier_retry(kvm, mmu_seq)) - goto out_unlock; - - if (vma_pagesize == PAGE_SIZE && !force_pte) { - /* - * Only PMD_SIZE transparent hugepages(THP) are - * currently supported. This code will need to be - * updated to support other THP sizes. - * - * Make sure the host VA and the guest IPA are sufficiently - * aligned and that the block is contained within the memslot. - */ - if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) && - transparent_hugepage_adjust(&pfn, &fault_ipa)) - vma_pagesize = PMD_SIZE; - } - - if (writable) - kvm_set_pfn_dirty(pfn); - - if (fault_status != FSC_PERM && !is_iomap(flags)) - clean_dcache_guest_page(pfn, vma_pagesize); - - if (exec_fault) - invalidate_icache_guest_page(pfn, vma_pagesize); - - /* - * If we took an execution fault we have made the - * icache/dcache coherent above and should now let the s2 - * mapping be executable. - * - * Write faults (!exec_fault && FSC_PERM) are orthogonal to - * execute permissions, and we preserve whatever we have. 
- */ - needs_exec = exec_fault || - (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa)); - - if (vma_pagesize == PUD_SIZE) { - pud_t new_pud = kvm_pfn_pud(pfn, mem_type); - - new_pud = kvm_pud_mkhuge(new_pud); - if (writable) - new_pud = kvm_s2pud_mkwrite(new_pud); - - if (needs_exec) - new_pud = kvm_s2pud_mkexec(new_pud); - - ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud); - } else if (vma_pagesize == PMD_SIZE) { - pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type); - - new_pmd = kvm_pmd_mkhuge(new_pmd); - - if (writable) - new_pmd = kvm_s2pmd_mkwrite(new_pmd); - - if (needs_exec) - new_pmd = kvm_s2pmd_mkexec(new_pmd); - - ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); - } else { - pte_t new_pte = kvm_pfn_pte(pfn, mem_type); - - if (writable) { - new_pte = kvm_s2pte_mkwrite(new_pte); - mark_page_dirty(kvm, gfn); - } - - if (needs_exec) - new_pte = kvm_s2pte_mkexec(new_pte); - - ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); - } - -out_unlock: - spin_unlock(&kvm->mmu_lock); - kvm_set_pfn_accessed(pfn); - kvm_release_pfn_clean(pfn); - return ret; -} - -/* - * Resolve the access fault by making the page young again. - * Note that because the faulting entry is guaranteed not to be - * cached in the TLB, we don't need to invalidate anything. - * Only the HW Access Flag updates are supported for Stage 2 (no DBM), - * so there is no need for atomic (pte|pmd)_mkyoung operations. 
- */ -static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) -{ - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - kvm_pfn_t pfn; - bool pfn_valid = false; - - trace_kvm_access_fault(fault_ipa); - - spin_lock(&vcpu->kvm->mmu_lock); - - if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte)) - goto out; - - if (pud) { /* HugeTLB */ - *pud = kvm_s2pud_mkyoung(*pud); - pfn = kvm_pud_pfn(*pud); - pfn_valid = true; - } else if (pmd) { /* THP, HugeTLB */ - *pmd = pmd_mkyoung(*pmd); - pfn = pmd_pfn(*pmd); - pfn_valid = true; - } else { - *pte = pte_mkyoung(*pte); /* Just a page... */ - pfn = pte_pfn(*pte); - pfn_valid = true; - } - -out: - spin_unlock(&vcpu->kvm->mmu_lock); - if (pfn_valid) - kvm_set_pfn_accessed(pfn); -} - -/** - * kvm_handle_guest_abort - handles all 2nd stage aborts - * @vcpu: the VCPU pointer - * @run: the kvm_run structure - * - * Any abort that gets to the host is almost guaranteed to be caused by a - * missing second stage translation table entry, which can mean that either the - * guest simply needs more memory and we must allocate an appropriate page or it - * can mean that the guest tried to access I/O memory, which is emulated by user - * space. The distinction is based on the IPA causing the fault and whether this - * memory region has been registered as standard RAM by user space. - */ -int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - unsigned long fault_status; - phys_addr_t fault_ipa; - struct kvm_memory_slot *memslot; - unsigned long hva; - bool is_iabt, write_fault, writable; - gfn_t gfn; - int ret, idx; - - fault_status = kvm_vcpu_trap_get_fault_type(vcpu); - - fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); - is_iabt = kvm_vcpu_trap_is_iabt(vcpu); - - /* Synchronous External Abort? */ - if (kvm_vcpu_dabt_isextabt(vcpu)) { - /* - * For RAS the host kernel may handle this abort. - * There is no need to pass the error into the guest. 
- */ - if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu))) - return 1; - - if (unlikely(!is_iabt)) { - kvm_inject_vabt(vcpu); - return 1; - } - } - - trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu), - kvm_vcpu_get_hfar(vcpu), fault_ipa); - - /* Check the stage-2 fault is trans. fault or write fault */ - if (fault_status != FSC_FAULT && fault_status != FSC_PERM && - fault_status != FSC_ACCESS) { - kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", - kvm_vcpu_trap_get_class(vcpu), - (unsigned long)kvm_vcpu_trap_get_fault(vcpu), - (unsigned long)kvm_vcpu_get_hsr(vcpu)); - return -EFAULT; - } - - idx = srcu_read_lock(&vcpu->kvm->srcu); - - gfn = fault_ipa >> PAGE_SHIFT; - memslot = gfn_to_memslot(vcpu->kvm, gfn); - hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); - write_fault = kvm_is_write_fault(vcpu); - if (kvm_is_error_hva(hva) || (write_fault && !writable)) { - if (is_iabt) { - /* Prefetch Abort on I/O address */ - ret = -ENOEXEC; - goto out; - } - - /* - * Check for a cache maintenance operation. Since we - * ended-up here, we know it is outside of any memory - * slot. But we can't find out if that is for a device, - * or if the guest is just being stupid. The only thing - * we know for sure is that this range cannot be cached. - * - * So let's assume that the guest is just being - * cautious, and skip the instruction. - */ - if (kvm_vcpu_dabt_is_cm(vcpu)) { - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - ret = 1; - goto out_unlock; - } - - /* - * The IPA is reported as [MAX:12], so we need to - * complement it with the bottom 12 bits from the - * faulting VA. This is always 12 bits, irrespective - * of the page size. 
- */ - fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); - ret = io_mem_abort(vcpu, run, fault_ipa); - goto out_unlock; - } - - /* Userspace should not be able to register out-of-bounds IPAs */ - VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm)); - - if (fault_status == FSC_ACCESS) { - handle_access_fault(vcpu, fault_ipa); - ret = 1; - goto out_unlock; - } - - ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); - if (ret == 0) - ret = 1; -out: - if (ret == -ENOEXEC) { - kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu)); - ret = 1; - } -out_unlock: - srcu_read_unlock(&vcpu->kvm->srcu, idx); - return ret; -} - -static int handle_hva_to_gpa(struct kvm *kvm, - unsigned long start, - unsigned long end, - int (*handler)(struct kvm *kvm, - gpa_t gpa, u64 size, - void *data), - void *data) -{ - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - int ret = 0; - - slots = kvm_memslots(kvm); - - /* we only care about the pages that the guest sees */ - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gpa; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - - gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT; - ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data); - } - - return ret; -} - -static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - unmap_stage2_range(kvm, gpa, size); - return 0; -} - -int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end) -{ - if (!kvm->arch.pgd) - return 0; - - trace_kvm_unmap_hva_range(start, end); - handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); - return 0; -} - -static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - pte_t *pte = (pte_t *)data; - - WARN_ON(size != PAGE_SIZE); - /* - * We can always call stage2_set_pte with 
KVM_S2PTE_FLAG_LOGGING_ACTIVE - * flag clear because MMU notifiers will have unmapped a huge PMD before - * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and - * therefore stage2_set_pte() never needs to clear out a huge PMD - * through this calling path. - */ - stage2_set_pte(kvm, NULL, gpa, pte, 0); - return 0; -} - - -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) -{ - unsigned long end = hva + PAGE_SIZE; - kvm_pfn_t pfn = pte_pfn(pte); - pte_t stage2_pte; - - if (!kvm->arch.pgd) - return 0; - - trace_kvm_set_spte_hva(hva); - - /* - * We've moved a page around, probably through CoW, so let's treat it - * just like a translation fault and clean the cache to the PoC. - */ - clean_dcache_guest_page(pfn, PAGE_SIZE); - stage2_pte = kvm_pfn_pte(pfn, PAGE_S2); - handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); - - return 0; -} - -static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) - return 0; - - if (pud) - return stage2_pudp_test_and_clear_young(pud); - else if (pmd) - return stage2_pmdp_test_and_clear_young(pmd); - else - return stage2_ptep_test_and_clear_young(pte); -} - -static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) - return 0; - - if (pud) - return kvm_s2pud_young(*pud); - else if (pmd) - return pmd_young(*pmd); - else - return pte_young(*pte); -} - -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) -{ - if (!kvm->arch.pgd) - return 0; - trace_kvm_age_hva(start, end); - return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); -} - -int kvm_test_age_hva(struct 
kvm *kvm, unsigned long hva) -{ - if (!kvm->arch.pgd) - return 0; - trace_kvm_test_age_hva(hva); - return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE, - kvm_test_age_hva_handler, NULL); -} - -void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) -{ - mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); -} - -phys_addr_t kvm_mmu_get_httbr(void) -{ - if (__kvm_cpu_uses_extended_idmap()) - return virt_to_phys(merged_hyp_pgd); - else - return virt_to_phys(hyp_pgd); -} - -phys_addr_t kvm_get_idmap_vector(void) -{ - return hyp_idmap_vector; -} - -static int kvm_map_idmap_text(pgd_t *pgd) -{ - int err; - - /* Create the idmap in the boot page tables */ - err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), - hyp_idmap_start, hyp_idmap_end, - __phys_to_pfn(hyp_idmap_start), - PAGE_HYP_EXEC); - if (err) - kvm_err("Failed to idmap %lx-%lx\n", - hyp_idmap_start, hyp_idmap_end); - - return err; -} - -int kvm_mmu_init(void) -{ - int err; - - hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start); - hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); - hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end); - hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE); - hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init); - - /* - * We rely on the linker script to ensure at build time that the HYP - * init code does not cross a page boundary. - */ - BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); - - kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); - kvm_debug("HYP VA range: %lx:%lx\n", - kern_hyp_va(PAGE_OFFSET), - kern_hyp_va((unsigned long)high_memory - 1)); - - if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && - hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) && - hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) { - /* - * The idmap page is intersecting with the VA space, - * it is not safe to continue further. 
- */ - kvm_err("IDMAP intersecting with HYP VA, unable to continue\n"); - err = -EINVAL; - goto out; - } - - hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); - if (!hyp_pgd) { - kvm_err("Hyp mode PGD not allocated\n"); - err = -ENOMEM; - goto out; - } - - if (__kvm_cpu_uses_extended_idmap()) { - boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - hyp_pgd_order); - if (!boot_hyp_pgd) { - kvm_err("Hyp boot PGD not allocated\n"); - err = -ENOMEM; - goto out; - } - - err = kvm_map_idmap_text(boot_hyp_pgd); - if (err) - goto out; - - merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - if (!merged_hyp_pgd) { - kvm_err("Failed to allocate extra HYP pgd\n"); - goto out; - } - __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd, - hyp_idmap_start); - } else { - err = kvm_map_idmap_text(hyp_pgd); - if (err) - goto out; - } - - io_map_base = hyp_idmap_start; - return 0; -out: - free_hyp_pgds(); - return err; -} - -void kvm_arch_commit_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot *old, - const struct kvm_memory_slot *new, - enum kvm_mr_change change) -{ - /* - * At this point memslot has been committed and there is an - * allocated dirty_bitmap[], dirty pages will be be tracked while the - * memory slot is write protected. 
- */ - if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) - kvm_mmu_wp_memory_region(kvm, mem->slot); -} - -int kvm_arch_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - const struct kvm_userspace_memory_region *mem, - enum kvm_mr_change change) -{ - hva_t hva = mem->userspace_addr; - hva_t reg_end = hva + mem->memory_size; - bool writable = !(mem->flags & KVM_MEM_READONLY); - int ret = 0; - - if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && - change != KVM_MR_FLAGS_ONLY) - return 0; - - /* - * Prevent userspace from creating a memory region outside of the IPA - * space addressable by the KVM guest IPA space. - */ - if (memslot->base_gfn + memslot->npages >= - (kvm_phys_size(kvm) >> PAGE_SHIFT)) - return -EFAULT; - - down_read(&current->mm->mmap_sem); - /* - * A memory region could potentially cover multiple VMAs, and any holes - * between them, so iterate over all of them to find out if we can map - * any of them right now. - * - * +--------------------------------------------+ - * +---------------+----------------+ +----------------+ - * | : VMA 1 | VMA 2 | | VMA 3 : | - * +---------------+----------------+ +----------------+ - * | memory region | - * +--------------------------------------------+ - */ - do { - struct vm_area_struct *vma = find_vma(current->mm, hva); - hva_t vm_start, vm_end; - - if (!vma || vma->vm_start >= reg_end) - break; - - /* - * Take the intersection of this VMA with the memory region - */ - vm_start = max(hva, vma->vm_start); - vm_end = min(reg_end, vma->vm_end); - - if (vma->vm_flags & VM_PFNMAP) { - gpa_t gpa = mem->guest_phys_addr + - (vm_start - mem->userspace_addr); - phys_addr_t pa; - - pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT; - pa += vm_start - vma->vm_start; - - /* IO region dirty page logging not allowed */ - if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) { - ret = -EINVAL; - goto out; - } - - ret = kvm_phys_addr_ioremap(kvm, gpa, pa, - vm_end - vm_start, - writable); - if (ret) 
- break; - } - hva = vm_end; - } while (hva < reg_end); - - if (change == KVM_MR_FLAGS_ONLY) - goto out; - - spin_lock(&kvm->mmu_lock); - if (ret) - unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size); - else - stage2_flush_memslot(kvm, memslot); - spin_unlock(&kvm->mmu_lock); -out: - up_read(&current->mm->mmap_sem); - return ret; -} - -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) -{ -} - -void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) -{ -} - -void kvm_arch_flush_shadow_all(struct kvm *kvm) -{ - kvm_free_stage2_pgd(kvm); -} - -void kvm_arch_flush_shadow_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ - gpa_t gpa = slot->base_gfn << PAGE_SHIFT; - phys_addr_t size = slot->npages << PAGE_SHIFT; - - spin_lock(&kvm->mmu_lock); - unmap_stage2_range(kvm, gpa, size); - spin_unlock(&kvm->mmu_lock); -} - -/* - * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). - * - * Main problems: - * - S/W ops are local to a CPU (not broadcast) - * - We have line migration behind our back (speculation) - * - System caches don't support S/W at all (damn!) - * - * In the face of the above, the best we can do is to try and convert - * S/W ops to VA ops. Because the guest is not allowed to infer the - * S/W to PA mapping, it can only use S/W to nuke the whole cache, - * which is a rather good thing for us. - * - * Also, it is only used when turning caches on/off ("The expected - * usage of the cache maintenance instructions that operate by set/way - * is associated with the cache maintenance instructions associated - * with the powerdown and powerup of caches, if this is required by - * the implementation."). - * - * We use the following policy: - * - * - If we trap a S/W operation, we enable VM trapping to detect - * caches being turned on/off, and do a full clean. - * - * - We flush the caches on both caches being turned on and off. - * - * - Once the caches are enabled, we stop trapping VM ops. 
- */ -void kvm_set_way_flush(struct kvm_vcpu *vcpu) -{ - unsigned long hcr = *vcpu_hcr(vcpu); - - /* - * If this is the first time we do a S/W operation - * (i.e. HCR_TVM not set) flush the whole memory, and set the - * VM trapping. - * - * Otherwise, rely on the VM trapping to wait for the MMU + - * Caches to be turned off. At that point, we'll be able to - * clean the caches again. - */ - if (!(hcr & HCR_TVM)) { - trace_kvm_set_way_flush(*vcpu_pc(vcpu), - vcpu_has_cache_enabled(vcpu)); - stage2_flush_vm(vcpu->kvm); - *vcpu_hcr(vcpu) = hcr | HCR_TVM; - } -} - -void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) -{ - bool now_enabled = vcpu_has_cache_enabled(vcpu); - - /* - * If switching the MMU+caches on, need to invalidate the caches. - * If switching it off, need to clean the caches. - * Clean + invalidate does the trick always. - */ - if (now_enabled != was_enabled) - stage2_flush_vm(vcpu->kvm); - - /* Caches are now on, stop trapping VM ops (until a S/W op) */ - if (now_enabled) - *vcpu_hcr(vcpu) &= ~HCR_TVM; - - trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); -} diff --git a/virt/kvm/arm/perf.c b/virt/kvm/arm/perf.c deleted file mode 100644 index d45b8b9a4415..000000000000 --- a/virt/kvm/arm/perf.c +++ /dev/null @@ -1,57 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Based on the x86 implementation. - * - * Copyright (C) 2012 ARM Ltd. 
- * Author: Marc Zyngier - */ - -#include <linux/perf_event.h> -#include <linux/kvm_host.h> - -#include <asm/kvm_emulate.h> - -static int kvm_is_in_guest(void) -{ - return kvm_get_running_vcpu() != NULL; -} - -static int kvm_is_user_mode(void) -{ - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_running_vcpu(); - - if (vcpu) - return !vcpu_mode_priv(vcpu); - - return 0; -} - -static unsigned long kvm_get_guest_ip(void) -{ - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_running_vcpu(); - - if (vcpu) - return *vcpu_pc(vcpu); - - return 0; -} - -static struct perf_guest_info_callbacks kvm_guest_cbs = { - .is_in_guest = kvm_is_in_guest, - .is_user_mode = kvm_is_user_mode, - .get_guest_ip = kvm_get_guest_ip, -}; - -int kvm_perf_init(void) -{ - return perf_register_guest_info_callbacks(&kvm_guest_cbs); -} - -int kvm_perf_teardown(void) -{ - return perf_unregister_guest_info_callbacks(&kvm_guest_cbs); -} diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c deleted file mode 100644 index f0d0312c0a55..000000000000 --- a/virt/kvm/arm/pmu.c +++ /dev/null @@ -1,869 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2015 Linaro Ltd. 
- * Author: Shannon Zhao - */ - -#include <linux/cpu.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <linux/perf_event.h> -#include <linux/perf/arm_pmu.h> -#include <linux/uaccess.h> -#include <asm/kvm_emulate.h> -#include <kvm/arm_pmu.h> -#include <kvm/arm_vgic.h> - -static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx); -static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx); -static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc); - -#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1 - -/** - * kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter - * @vcpu: The vcpu pointer - * @select_idx: The counter index - */ -static bool kvm_pmu_idx_is_64bit(struct kvm_vcpu *vcpu, u64 select_idx) -{ - return (select_idx == ARMV8_PMU_CYCLE_IDX && - __vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_LC); -} - -static struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu; - struct kvm_vcpu_arch *vcpu_arch; - - pmc -= pmc->idx; - pmu = container_of(pmc, struct kvm_pmu, pmc[0]); - vcpu_arch = container_of(pmu, struct kvm_vcpu_arch, pmu); - return container_of(vcpu_arch, struct kvm_vcpu, arch); -} - -/** - * kvm_pmu_pmc_is_chained - determine if the pmc is chained - * @pmc: The PMU counter pointer - */ -static bool kvm_pmu_pmc_is_chained(struct kvm_pmc *pmc) -{ - struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); - - return test_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); -} - -/** - * kvm_pmu_idx_is_high_counter - determine if select_idx is a high/low counter - * @select_idx: The counter index - */ -static bool kvm_pmu_idx_is_high_counter(u64 select_idx) -{ - return select_idx & 0x1; -} - -/** - * kvm_pmu_get_canonical_pmc - obtain the canonical pmc - * @pmc: The PMU counter pointer - * - * When a pair of PMCs are chained together we use the low counter (canonical) - * to hold the underlying perf event. 
- */ -static struct kvm_pmc *kvm_pmu_get_canonical_pmc(struct kvm_pmc *pmc) -{ - if (kvm_pmu_pmc_is_chained(pmc) && - kvm_pmu_idx_is_high_counter(pmc->idx)) - return pmc - 1; - - return pmc; -} -static struct kvm_pmc *kvm_pmu_get_alternate_pmc(struct kvm_pmc *pmc) -{ - if (kvm_pmu_idx_is_high_counter(pmc->idx)) - return pmc - 1; - else - return pmc + 1; -} - -/** - * kvm_pmu_idx_has_chain_evtype - determine if the event type is chain - * @vcpu: The vcpu pointer - * @select_idx: The counter index - */ -static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx) -{ - u64 eventsel, reg; - - select_idx |= 0x1; - - if (select_idx == ARMV8_PMU_CYCLE_IDX) - return false; - - reg = PMEVTYPER0_EL0 + select_idx; - eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT; - - return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN; -} - -/** - * kvm_pmu_get_pair_counter_value - get PMU counter value - * @vcpu: The vcpu pointer - * @pmc: The PMU counter pointer - */ -static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu, - struct kvm_pmc *pmc) -{ - u64 counter, counter_high, reg, enabled, running; - - if (kvm_pmu_pmc_is_chained(pmc)) { - pmc = kvm_pmu_get_canonical_pmc(pmc); - reg = PMEVCNTR0_EL0 + pmc->idx; - - counter = __vcpu_sys_reg(vcpu, reg); - counter_high = __vcpu_sys_reg(vcpu, reg + 1); - - counter = lower_32_bits(counter) | (counter_high << 32); - } else { - reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx; - counter = __vcpu_sys_reg(vcpu, reg); - } - - /* - * The real counter value is equal to the value of counter register plus - * the value perf event counts. 
- */ - if (pmc->perf_event) - counter += perf_event_read_value(pmc->perf_event, &enabled, - &running); - - return counter; -} - -/** - * kvm_pmu_get_counter_value - get PMU counter value - * @vcpu: The vcpu pointer - * @select_idx: The counter index - */ -u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) -{ - u64 counter; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc = &pmu->pmc[select_idx]; - - counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); - - if (kvm_pmu_pmc_is_chained(pmc) && - kvm_pmu_idx_is_high_counter(select_idx)) - counter = upper_32_bits(counter); - else if (select_idx != ARMV8_PMU_CYCLE_IDX) - counter = lower_32_bits(counter); - - return counter; -} - -/** - * kvm_pmu_set_counter_value - set PMU counter value - * @vcpu: The vcpu pointer - * @select_idx: The counter index - * @val: The counter value - */ -void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) -{ - u64 reg; - - reg = (select_idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx; - __vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx); - - /* Recreate the perf event to reflect the updated sample_period */ - kvm_pmu_create_perf_event(vcpu, select_idx); -} - -/** - * kvm_pmu_release_perf_event - remove the perf event - * @pmc: The PMU counter pointer - */ -static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc) -{ - pmc = kvm_pmu_get_canonical_pmc(pmc); - if (pmc->perf_event) { - perf_event_disable(pmc->perf_event); - perf_event_release_kernel(pmc->perf_event); - pmc->perf_event = NULL; - } -} - -/** - * kvm_pmu_stop_counter - stop PMU counter - * @pmc: The PMU counter pointer - * - * If this counter has been configured to monitor some event, release it here. 
- */ -static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc) -{ - u64 counter, reg, val; - - pmc = kvm_pmu_get_canonical_pmc(pmc); - if (!pmc->perf_event) - return; - - counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); - - if (pmc->idx == ARMV8_PMU_CYCLE_IDX) { - reg = PMCCNTR_EL0; - val = counter; - } else { - reg = PMEVCNTR0_EL0 + pmc->idx; - val = lower_32_bits(counter); - } - - __vcpu_sys_reg(vcpu, reg) = val; - - if (kvm_pmu_pmc_is_chained(pmc)) - __vcpu_sys_reg(vcpu, reg + 1) = upper_32_bits(counter); - - kvm_pmu_release_perf_event(pmc); -} - -/** - * kvm_pmu_vcpu_init - assign pmu counter idx for cpu - * @vcpu: The vcpu pointer - * - */ -void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) - pmu->pmc[i].idx = i; -} - -/** - * kvm_pmu_vcpu_reset - reset pmu state for cpu - * @vcpu: The vcpu pointer - * - */ -void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) -{ - unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); - struct kvm_pmu *pmu = &vcpu->arch.pmu; - int i; - - for_each_set_bit(i, &mask, 32) - kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]); - - bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS); -} - -/** - * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu - * @vcpu: The vcpu pointer - * - */ -void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) - kvm_pmu_release_perf_event(&pmu->pmc[i]); -} - -u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) -{ - u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT; - - val &= ARMV8_PMU_PMCR_N_MASK; - if (val == 0) - return BIT(ARMV8_PMU_CYCLE_IDX); - else - return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX); -} - -/** - * kvm_pmu_enable_counter_mask - enable selected PMU counters - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMCNTENSET register - * - * 
Call perf_event_enable to start counting the perf event - */ -void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) -{ - int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; - - if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val) - return; - - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { - if (!(val & BIT(i))) - continue; - - pmc = &pmu->pmc[i]; - - /* A change in the enable state may affect the chain state */ - kvm_pmu_update_pmc_chained(vcpu, i); - kvm_pmu_create_perf_event(vcpu, i); - - /* At this point, pmc must be the canonical */ - if (pmc->perf_event) { - perf_event_enable(pmc->perf_event); - if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) - kvm_debug("fail to enable perf event\n"); - } - } -} - -/** - * kvm_pmu_disable_counter_mask - disable selected PMU counters - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMCNTENCLR register - * - * Call perf_event_disable to stop counting the perf event - */ -void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) -{ - int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; - - if (!val) - return; - - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { - if (!(val & BIT(i))) - continue; - - pmc = &pmu->pmc[i]; - - /* A change in the enable state may affect the chain state */ - kvm_pmu_update_pmc_chained(vcpu, i); - kvm_pmu_create_perf_event(vcpu, i); - - /* At this point, pmc must be the canonical */ - if (pmc->perf_event) - perf_event_disable(pmc->perf_event); - } -} - -static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) -{ - u64 reg = 0; - - if ((__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) { - reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); - reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); - reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); - reg &= kvm_pmu_valid_counter_mask(vcpu); - } - - return reg; -} - -static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - bool overflow; - - if 
(!kvm_arm_pmu_v3_ready(vcpu)) - return; - - overflow = !!kvm_pmu_overflow_status(vcpu); - if (pmu->irq_level == overflow) - return; - - pmu->irq_level = overflow; - - if (likely(irqchip_in_kernel(vcpu->kvm))) { - int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, - pmu->irq_num, overflow, pmu); - WARN_ON(ret); - } -} - -bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_sync_regs *sregs = &vcpu->run->s.regs; - bool run_level = sregs->device_irq_level & KVM_ARM_DEV_PMU; - - if (likely(irqchip_in_kernel(vcpu->kvm))) - return false; - - return pmu->irq_level != run_level; -} - -/* - * Reflect the PMU overflow interrupt output level into the kvm_run structure - */ -void kvm_pmu_update_run(struct kvm_vcpu *vcpu) -{ - struct kvm_sync_regs *regs = &vcpu->run->s.regs; - - /* Populate the timer bitmap for user space */ - regs->device_irq_level &= ~KVM_ARM_DEV_PMU; - if (vcpu->arch.pmu.irq_level) - regs->device_irq_level |= KVM_ARM_DEV_PMU; -} - -/** - * kvm_pmu_flush_hwstate - flush pmu state to cpu - * @vcpu: The vcpu pointer - * - * Check if the PMU has overflowed while we were running in the host, and inject - * an interrupt if that was the case. - */ -void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) -{ - kvm_pmu_update_state(vcpu); -} - -/** - * kvm_pmu_sync_hwstate - sync pmu state from cpu - * @vcpu: The vcpu pointer - * - * Check if the PMU has overflowed while we were running in the guest, and - * inject an interrupt if that was the case. - */ -void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) -{ - kvm_pmu_update_state(vcpu); -} - -/** - * When the perf event overflows, set the overflow status and inform the vcpu. 
- */ -static void kvm_pmu_perf_overflow(struct perf_event *perf_event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct kvm_pmc *pmc = perf_event->overflow_handler_context; - struct arm_pmu *cpu_pmu = to_arm_pmu(perf_event->pmu); - struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); - int idx = pmc->idx; - u64 period; - - cpu_pmu->pmu.stop(perf_event, PERF_EF_UPDATE); - - /* - * Reset the sample period to the architectural limit, - * i.e. the point where the counter overflows. - */ - period = -(local64_read(&perf_event->count)); - - if (!kvm_pmu_idx_is_64bit(vcpu, pmc->idx)) - period &= GENMASK(31, 0); - - local64_set(&perf_event->hw.period_left, 0); - perf_event->attr.sample_period = period; - perf_event->hw.sample_period = period; - - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx); - - if (kvm_pmu_overflow_status(vcpu)) { - kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); - kvm_vcpu_kick(vcpu); - } - - cpu_pmu->pmu.start(perf_event, PERF_EF_RELOAD); -} - -/** - * kvm_pmu_software_increment - do software increment - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMSWINC register - */ -void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - int i; - - if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) - return; - - /* Weed out disabled counters */ - val &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); - - for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) { - u64 type, reg; - - if (!(val & BIT(i))) - continue; - - /* PMSWINC only applies to ... SW_INC! 
*/ - type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i); - type &= ARMV8_PMU_EVTYPE_EVENT; - if (type != ARMV8_PMUV3_PERFCTR_SW_INCR) - continue; - - /* increment this even SW_INC counter */ - reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1; - reg = lower_32_bits(reg); - __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg; - - if (reg) /* no overflow on the low part */ - continue; - - if (kvm_pmu_pmc_is_chained(&pmu->pmc[i])) { - /* increment the high counter */ - reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) + 1; - reg = lower_32_bits(reg); - __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) = reg; - if (!reg) /* mark overflow on the high counter */ - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i + 1); - } else { - /* mark overflow on low counter */ - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); - } - } -} - -/** - * kvm_pmu_handle_pmcr - handle PMCR register - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMCR register - */ -void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) -{ - unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); - int i; - - if (val & ARMV8_PMU_PMCR_E) { - kvm_pmu_enable_counter_mask(vcpu, - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask); - } else { - kvm_pmu_disable_counter_mask(vcpu, mask); - } - - if (val & ARMV8_PMU_PMCR_C) - kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); - - if (val & ARMV8_PMU_PMCR_P) { - for_each_set_bit(i, &mask, 32) - kvm_pmu_set_counter_value(vcpu, i, 0); - } -} - -static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx) -{ - return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) && - (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx)); -} - -/** - * kvm_pmu_create_perf_event - create a perf event for a counter - * @vcpu: The vcpu pointer - * @select_idx: The number of selected counter - */ -static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; - struct perf_event *event; - 
struct perf_event_attr attr; - u64 eventsel, counter, reg, data; - - /* - * For chained counters the event type and filtering attributes are - * obtained from the low/even counter. We also use this counter to - * determine if the event is enabled/disabled. - */ - pmc = kvm_pmu_get_canonical_pmc(&pmu->pmc[select_idx]); - - reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + pmc->idx; - data = __vcpu_sys_reg(vcpu, reg); - - kvm_pmu_stop_counter(vcpu, pmc); - eventsel = data & ARMV8_PMU_EVTYPE_EVENT; - - /* Software increment event does't need to be backed by a perf event */ - if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR && - pmc->idx != ARMV8_PMU_CYCLE_IDX) - return; - - memset(&attr, 0, sizeof(struct perf_event_attr)); - attr.type = PERF_TYPE_RAW; - attr.size = sizeof(attr); - attr.pinned = 1; - attr.disabled = !kvm_pmu_counter_is_enabled(vcpu, pmc->idx); - attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0; - attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0; - attr.exclude_hv = 1; /* Don't count EL2 events */ - attr.exclude_host = 1; /* Don't count host events */ - attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ? - ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel; - - counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); - - if (kvm_pmu_pmc_is_chained(pmc)) { - /** - * The initial sample period (overflow count) of an event. For - * chained counters we only support overflow interrupts on the - * high counter. - */ - attr.sample_period = (-counter) & GENMASK(63, 0); - attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED; - - event = perf_event_create_kernel_counter(&attr, -1, current, - kvm_pmu_perf_overflow, - pmc + 1); - } else { - /* The initial sample period (overflow count) of an event. 
*/ - if (kvm_pmu_idx_is_64bit(vcpu, pmc->idx)) - attr.sample_period = (-counter) & GENMASK(63, 0); - else - attr.sample_period = (-counter) & GENMASK(31, 0); - - event = perf_event_create_kernel_counter(&attr, -1, current, - kvm_pmu_perf_overflow, pmc); - } - - if (IS_ERR(event)) { - pr_err_once("kvm: pmu event creation failed %ld\n", - PTR_ERR(event)); - return; - } - - pmc->perf_event = event; -} - -/** - * kvm_pmu_update_pmc_chained - update chained bitmap - * @vcpu: The vcpu pointer - * @select_idx: The number of selected counter - * - * Update the chained bitmap based on the event type written in the - * typer register and the enable state of the odd register. - */ -static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc = &pmu->pmc[select_idx], *canonical_pmc; - bool new_state, old_state; - - old_state = kvm_pmu_pmc_is_chained(pmc); - new_state = kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx) && - kvm_pmu_counter_is_enabled(vcpu, pmc->idx | 0x1); - - if (old_state == new_state) - return; - - canonical_pmc = kvm_pmu_get_canonical_pmc(pmc); - kvm_pmu_stop_counter(vcpu, canonical_pmc); - if (new_state) { - /* - * During promotion from !chained to chained we must ensure - * the adjacent counter is stopped and its event destroyed - */ - kvm_pmu_stop_counter(vcpu, kvm_pmu_get_alternate_pmc(pmc)); - set_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); - return; - } - clear_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); -} - -/** - * kvm_pmu_set_counter_event_type - set selected counter to monitor some event - * @vcpu: The vcpu pointer - * @data: The data guest writes to PMXEVTYPER_EL0 - * @select_idx: The number of selected counter - * - * When OS accesses PMXEVTYPER_EL0, that means it wants to set a PMC to count an - * event with given hardware event number. Here we call perf_event API to - * emulate this action and create a kernel perf event for it. 
- */ -void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, - u64 select_idx) -{ - u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK; - - reg = (select_idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx; - - __vcpu_sys_reg(vcpu, reg) = event_type; - - kvm_pmu_update_pmc_chained(vcpu, select_idx); - kvm_pmu_create_perf_event(vcpu, select_idx); -} - -bool kvm_arm_support_pmu_v3(void) -{ - /* - * Check if HW_PERF_EVENTS are supported by checking the number of - * hardware performance counters. This could ensure the presence of - * a physical PMU and CONFIG_PERF_EVENT is selected. - */ - return (perf_num_counters() > 0); -} - -int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) -{ - if (!vcpu->arch.pmu.created) - return 0; - - /* - * A valid interrupt configuration for the PMU is either to have a - * properly configured interrupt number and using an in-kernel - * irqchip, or to not have an in-kernel GIC and not set an IRQ. - */ - if (irqchip_in_kernel(vcpu->kvm)) { - int irq = vcpu->arch.pmu.irq_num; - if (!kvm_arm_pmu_irq_initialized(vcpu)) - return -EINVAL; - - /* - * If we are using an in-kernel vgic, at this point we know - * the vgic will be initialized, so we can check the PMU irq - * number against the dimensions of the vgic and make sure - * it's valid. - */ - if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq)) - return -EINVAL; - } else if (kvm_arm_pmu_irq_initialized(vcpu)) { - return -EINVAL; - } - - kvm_pmu_vcpu_reset(vcpu); - vcpu->arch.pmu.ready = true; - - return 0; -} - -static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) -{ - if (!kvm_arm_support_pmu_v3()) - return -ENODEV; - - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) - return -ENXIO; - - if (vcpu->arch.pmu.created) - return -EBUSY; - - if (irqchip_in_kernel(vcpu->kvm)) { - int ret; - - /* - * If using the PMU with an in-kernel virtual GIC - * implementation, we require the GIC to be already - * initialized when initializing the PMU. 
- */ - if (!vgic_initialized(vcpu->kvm)) - return -ENODEV; - - if (!kvm_arm_pmu_irq_initialized(vcpu)) - return -ENXIO; - - ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num, - &vcpu->arch.pmu); - if (ret) - return ret; - } - - vcpu->arch.pmu.created = true; - return 0; -} - -/* - * For one VM the interrupt type must be same for each vcpu. - * As a PPI, the interrupt number is the same for all vcpus, - * while as an SPI it must be a separate number per vcpu. - */ -static bool pmu_irq_is_valid(struct kvm *kvm, int irq) -{ - int i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!kvm_arm_pmu_irq_initialized(vcpu)) - continue; - - if (irq_is_ppi(irq)) { - if (vcpu->arch.pmu.irq_num != irq) - return false; - } else { - if (vcpu->arch.pmu.irq_num == irq) - return false; - } - } - - return true; -} - -int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - switch (attr->attr) { - case KVM_ARM_VCPU_PMU_V3_IRQ: { - int __user *uaddr = (int __user *)(long)attr->addr; - int irq; - - if (!irqchip_in_kernel(vcpu->kvm)) - return -EINVAL; - - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) - return -ENODEV; - - if (get_user(irq, uaddr)) - return -EFAULT; - - /* The PMU overflow interrupt can be a PPI or a valid SPI. 
*/ - if (!(irq_is_ppi(irq) || irq_is_spi(irq))) - return -EINVAL; - - if (!pmu_irq_is_valid(vcpu->kvm, irq)) - return -EINVAL; - - if (kvm_arm_pmu_irq_initialized(vcpu)) - return -EBUSY; - - kvm_debug("Set kvm ARM PMU irq: %d\n", irq); - vcpu->arch.pmu.irq_num = irq; - return 0; - } - case KVM_ARM_VCPU_PMU_V3_INIT: - return kvm_arm_pmu_v3_init(vcpu); - } - - return -ENXIO; -} - -int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - switch (attr->attr) { - case KVM_ARM_VCPU_PMU_V3_IRQ: { - int __user *uaddr = (int __user *)(long)attr->addr; - int irq; - - if (!irqchip_in_kernel(vcpu->kvm)) - return -EINVAL; - - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) - return -ENODEV; - - if (!kvm_arm_pmu_irq_initialized(vcpu)) - return -ENXIO; - - irq = vcpu->arch.pmu.irq_num; - return put_user(irq, uaddr); - } - } - - return -ENXIO; -} - -int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - switch (attr->attr) { - case KVM_ARM_VCPU_PMU_V3_IRQ: - case KVM_ARM_VCPU_PMU_V3_INIT: - if (kvm_arm_support_pmu_v3() && - test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) - return 0; - } - - return -ENXIO; -} diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c deleted file mode 100644 index ae364716ee40..000000000000 --- a/virt/kvm/arm/psci.c +++ /dev/null @@ -1,564 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2012 - ARM Ltd - * Author: Marc Zyngier - */ - -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -/* - * This is an implementation of the Power State Coordination Interface - * as described in ARM document number ARM DEN 0022A. 
- */ - -#define AFFINITY_MASK(level) ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1) - -static unsigned long psci_affinity_mask(unsigned long affinity_level) -{ - if (affinity_level <= 3) - return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level); - - return 0; -} - -static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu) -{ - /* - * NOTE: For simplicity, we make VCPU suspend emulation to be - * same-as WFI (Wait-for-interrupt) emulation. - * - * This means for KVM the wakeup events are interrupts and - * this is consistent with intended use of StateID as described - * in section 5.4.1 of PSCI v0.2 specification (ARM DEN 0022A). - * - * Further, we also treat power-down request to be same as - * stand-by request as-per section 5.4.2 clause 3 of PSCI v0.2 - * specification (ARM DEN 0022A). This means all suspend states - * for KVM will preserve the register state. - */ - kvm_vcpu_block(vcpu); - kvm_clear_request(KVM_REQ_UNHALT, vcpu); - - return PSCI_RET_SUCCESS; -} - -static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) -{ - vcpu->arch.power_off = true; - kvm_make_request(KVM_REQ_SLEEP, vcpu); - kvm_vcpu_kick(vcpu); -} - -static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) -{ - struct vcpu_reset_state *reset_state; - struct kvm *kvm = source_vcpu->kvm; - struct kvm_vcpu *vcpu = NULL; - unsigned long cpu_id; - - cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK; - if (vcpu_mode_is_32bit(source_vcpu)) - cpu_id &= ~((u32) 0); - - vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id); - - /* - * Make sure the caller requested a valid CPU and that the CPU is - * turned off. 
- */ - if (!vcpu) - return PSCI_RET_INVALID_PARAMS; - if (!vcpu->arch.power_off) { - if (kvm_psci_version(source_vcpu, kvm) != KVM_ARM_PSCI_0_1) - return PSCI_RET_ALREADY_ON; - else - return PSCI_RET_INVALID_PARAMS; - } - - reset_state = &vcpu->arch.reset_state; - - reset_state->pc = smccc_get_arg2(source_vcpu); - - /* Propagate caller endianness */ - reset_state->be = kvm_vcpu_is_be(source_vcpu); - - /* - * NOTE: We always update r0 (or x0) because for PSCI v0.1 - * the general puspose registers are undefined upon CPU_ON. - */ - reset_state->r0 = smccc_get_arg3(source_vcpu); - - WRITE_ONCE(reset_state->reset, true); - kvm_make_request(KVM_REQ_VCPU_RESET, vcpu); - - /* - * Make sure the reset request is observed if the change to - * power_state is observed. - */ - smp_wmb(); - - vcpu->arch.power_off = false; - kvm_vcpu_wake_up(vcpu); - - return PSCI_RET_SUCCESS; -} - -static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) -{ - int i, matching_cpus = 0; - unsigned long mpidr; - unsigned long target_affinity; - unsigned long target_affinity_mask; - unsigned long lowest_affinity_level; - struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu *tmp; - - target_affinity = smccc_get_arg1(vcpu); - lowest_affinity_level = smccc_get_arg2(vcpu); - - /* Determine target affinity mask */ - target_affinity_mask = psci_affinity_mask(lowest_affinity_level); - if (!target_affinity_mask) - return PSCI_RET_INVALID_PARAMS; - - /* Ignore other bits of target affinity */ - target_affinity &= target_affinity_mask; - - /* - * If one or more VCPU matching target affinity are running - * then ON else OFF - */ - kvm_for_each_vcpu(i, tmp, kvm) { - mpidr = kvm_vcpu_get_mpidr_aff(tmp); - if ((mpidr & target_affinity_mask) == target_affinity) { - matching_cpus++; - if (!tmp->arch.power_off) - return PSCI_0_2_AFFINITY_LEVEL_ON; - } - } - - if (!matching_cpus) - return PSCI_RET_INVALID_PARAMS; - - return PSCI_0_2_AFFINITY_LEVEL_OFF; -} - -static void kvm_prepare_system_event(struct 
kvm_vcpu *vcpu, u32 type) -{ - int i; - struct kvm_vcpu *tmp; - - /* - * The KVM ABI specifies that a system event exit may call KVM_RUN - * again and may perform shutdown/reboot at a later time that when the - * actual request is made. Since we are implementing PSCI and a - * caller of PSCI reboot and shutdown expects that the system shuts - * down or reboots immediately, let's make sure that VCPUs are not run - * after this call is handled and before the VCPUs have been - * re-initialized. - */ - kvm_for_each_vcpu(i, tmp, vcpu->kvm) - tmp->arch.power_off = true; - kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP); - - memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); - vcpu->run->system_event.type = type; - vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; -} - -static void kvm_psci_system_off(struct kvm_vcpu *vcpu) -{ - kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN); -} - -static void kvm_psci_system_reset(struct kvm_vcpu *vcpu) -{ - kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET); -} - -static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu) -{ - int i; - - /* - * Zero the input registers' upper 32 bits. They will be fully - * zeroed on exit, so we're fine changing them in place. 
- */ - for (i = 1; i < 4; i++) - vcpu_set_reg(vcpu, i, lower_32_bits(vcpu_get_reg(vcpu, i))); -} - -static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 fn) -{ - switch(fn) { - case PSCI_0_2_FN64_CPU_SUSPEND: - case PSCI_0_2_FN64_CPU_ON: - case PSCI_0_2_FN64_AFFINITY_INFO: - /* Disallow these functions for 32bit guests */ - if (vcpu_mode_is_32bit(vcpu)) - return PSCI_RET_NOT_SUPPORTED; - break; - } - - return 0; -} - -static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - u32 psci_fn = smccc_get_function(vcpu); - unsigned long val; - int ret = 1; - - val = kvm_psci_check_allowed_function(vcpu, psci_fn); - if (val) - goto out; - - switch (psci_fn) { - case PSCI_0_2_FN_PSCI_VERSION: - /* - * Bits[31:16] = Major Version = 0 - * Bits[15:0] = Minor Version = 2 - */ - val = KVM_ARM_PSCI_0_2; - break; - case PSCI_0_2_FN_CPU_SUSPEND: - case PSCI_0_2_FN64_CPU_SUSPEND: - val = kvm_psci_vcpu_suspend(vcpu); - break; - case PSCI_0_2_FN_CPU_OFF: - kvm_psci_vcpu_off(vcpu); - val = PSCI_RET_SUCCESS; - break; - case PSCI_0_2_FN_CPU_ON: - kvm_psci_narrow_to_32bit(vcpu); - fallthrough; - case PSCI_0_2_FN64_CPU_ON: - mutex_lock(&kvm->lock); - val = kvm_psci_vcpu_on(vcpu); - mutex_unlock(&kvm->lock); - break; - case PSCI_0_2_FN_AFFINITY_INFO: - kvm_psci_narrow_to_32bit(vcpu); - fallthrough; - case PSCI_0_2_FN64_AFFINITY_INFO: - val = kvm_psci_vcpu_affinity_info(vcpu); - break; - case PSCI_0_2_FN_MIGRATE_INFO_TYPE: - /* - * Trusted OS is MP hence does not require migration - * or - * Trusted OS is not present - */ - val = PSCI_0_2_TOS_MP; - break; - case PSCI_0_2_FN_SYSTEM_OFF: - kvm_psci_system_off(vcpu); - /* - * We should'nt be going back to guest VCPU after - * receiving SYSTEM_OFF request. - * - * If user space accidently/deliberately resumes - * guest VCPU after SYSTEM_OFF request then guest - * VCPU should see internal failure from PSCI return - * value. 
To achieve this, we preload r0 (or x0) with - * PSCI return value INTERNAL_FAILURE. - */ - val = PSCI_RET_INTERNAL_FAILURE; - ret = 0; - break; - case PSCI_0_2_FN_SYSTEM_RESET: - kvm_psci_system_reset(vcpu); - /* - * Same reason as SYSTEM_OFF for preloading r0 (or x0) - * with PSCI return value INTERNAL_FAILURE. - */ - val = PSCI_RET_INTERNAL_FAILURE; - ret = 0; - break; - default: - val = PSCI_RET_NOT_SUPPORTED; - break; - } - -out: - smccc_set_retval(vcpu, val, 0, 0, 0); - return ret; -} - -static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu) -{ - u32 psci_fn = smccc_get_function(vcpu); - u32 feature; - unsigned long val; - int ret = 1; - - switch(psci_fn) { - case PSCI_0_2_FN_PSCI_VERSION: - val = KVM_ARM_PSCI_1_0; - break; - case PSCI_1_0_FN_PSCI_FEATURES: - feature = smccc_get_arg1(vcpu); - val = kvm_psci_check_allowed_function(vcpu, feature); - if (val) - break; - - switch(feature) { - case PSCI_0_2_FN_PSCI_VERSION: - case PSCI_0_2_FN_CPU_SUSPEND: - case PSCI_0_2_FN64_CPU_SUSPEND: - case PSCI_0_2_FN_CPU_OFF: - case PSCI_0_2_FN_CPU_ON: - case PSCI_0_2_FN64_CPU_ON: - case PSCI_0_2_FN_AFFINITY_INFO: - case PSCI_0_2_FN64_AFFINITY_INFO: - case PSCI_0_2_FN_MIGRATE_INFO_TYPE: - case PSCI_0_2_FN_SYSTEM_OFF: - case PSCI_0_2_FN_SYSTEM_RESET: - case PSCI_1_0_FN_PSCI_FEATURES: - case ARM_SMCCC_VERSION_FUNC_ID: - val = 0; - break; - default: - val = PSCI_RET_NOT_SUPPORTED; - break; - } - break; - default: - return kvm_psci_0_2_call(vcpu); - } - - smccc_set_retval(vcpu, val, 0, 0, 0); - return ret; -} - -static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - u32 psci_fn = smccc_get_function(vcpu); - unsigned long val; - - switch (psci_fn) { - case KVM_PSCI_FN_CPU_OFF: - kvm_psci_vcpu_off(vcpu); - val = PSCI_RET_SUCCESS; - break; - case KVM_PSCI_FN_CPU_ON: - mutex_lock(&kvm->lock); - val = kvm_psci_vcpu_on(vcpu); - mutex_unlock(&kvm->lock); - break; - default: - val = PSCI_RET_NOT_SUPPORTED; - break; - } - - smccc_set_retval(vcpu, val, 0, 0, 
0); - return 1; -} - -/** - * kvm_psci_call - handle PSCI call if r0 value is in range - * @vcpu: Pointer to the VCPU struct - * - * Handle PSCI calls from guests through traps from HVC instructions. - * The calling convention is similar to SMC calls to the secure world - * where the function number is placed in r0. - * - * This function returns: > 0 (success), 0 (success but exit to user - * space), and < 0 (errors) - * - * Errors: - * -EINVAL: Unrecognized PSCI function - */ -int kvm_psci_call(struct kvm_vcpu *vcpu) -{ - switch (kvm_psci_version(vcpu, vcpu->kvm)) { - case KVM_ARM_PSCI_1_0: - return kvm_psci_1_0_call(vcpu); - case KVM_ARM_PSCI_0_2: - return kvm_psci_0_2_call(vcpu); - case KVM_ARM_PSCI_0_1: - return kvm_psci_0_1_call(vcpu); - default: - return -EINVAL; - }; -} - -int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu) -{ - return 3; /* PSCI version and two workaround registers */ -} - -int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) -{ - if (put_user(KVM_REG_ARM_PSCI_VERSION, uindices++)) - return -EFAULT; - - if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1, uindices++)) - return -EFAULT; - - if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2, uindices++)) - return -EFAULT; - - return 0; -} - -#define KVM_REG_FEATURE_LEVEL_WIDTH 4 -#define KVM_REG_FEATURE_LEVEL_MASK (BIT(KVM_REG_FEATURE_LEVEL_WIDTH) - 1) - -/* - * Convert the workaround level into an easy-to-compare number, where higher - * values mean better protection. 
- */ -static int get_kernel_wa_level(u64 regid) -{ - switch (regid) { - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: - switch (kvm_arm_harden_branch_predictor()) { - case KVM_BP_HARDEN_UNKNOWN: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; - case KVM_BP_HARDEN_WA_NEEDED: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL; - case KVM_BP_HARDEN_NOT_REQUIRED: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED; - } - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: - switch (kvm_arm_have_ssbd()) { - case KVM_SSBD_FORCE_DISABLE: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; - case KVM_SSBD_KERNEL: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL; - case KVM_SSBD_FORCE_ENABLE: - case KVM_SSBD_MITIGATED: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED; - case KVM_SSBD_UNKNOWN: - default: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN; - } - } - - return -EINVAL; -} - -int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - void __user *uaddr = (void __user *)(long)reg->addr; - u64 val; - - switch (reg->id) { - case KVM_REG_ARM_PSCI_VERSION: - val = kvm_psci_version(vcpu, vcpu->kvm); - break; - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: - val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; - break; - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: - val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; - - if (val == KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL && - kvm_arm_get_vcpu_workaround_2_flag(vcpu)) - val |= KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED; - break; - default: - return -ENOENT; - } - - if (copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id))) - return -EFAULT; - - return 0; -} - -int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - void __user *uaddr = (void __user *)(long)reg->addr; - u64 val; - int wa_level; - - if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id))) - return -EFAULT; - - switch 
(reg->id) { - case KVM_REG_ARM_PSCI_VERSION: - { - bool wants_02; - - wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features); - - switch (val) { - case KVM_ARM_PSCI_0_1: - if (wants_02) - return -EINVAL; - vcpu->kvm->arch.psci_version = val; - return 0; - case KVM_ARM_PSCI_0_2: - case KVM_ARM_PSCI_1_0: - if (!wants_02) - return -EINVAL; - vcpu->kvm->arch.psci_version = val; - return 0; - } - break; - } - - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: - if (val & ~KVM_REG_FEATURE_LEVEL_MASK) - return -EINVAL; - - if (get_kernel_wa_level(reg->id) < val) - return -EINVAL; - - return 0; - - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: - if (val & ~(KVM_REG_FEATURE_LEVEL_MASK | - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED)) - return -EINVAL; - - wa_level = val & KVM_REG_FEATURE_LEVEL_MASK; - - if (get_kernel_wa_level(reg->id) < wa_level) - return -EINVAL; - - /* The enabled bit must not be set unless the level is AVAIL. */ - if (wa_level != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL && - wa_level != val) - return -EINVAL; - - /* Are we finished or do we need to check the enable bit ? */ - if (kvm_arm_have_ssbd() != KVM_SSBD_KERNEL) - return 0; - - /* - * If this kernel supports the workaround to be switched on - * or off, make sure it matches the requested setting. - */ - switch (wa_level) { - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: - kvm_arm_set_vcpu_workaround_2_flag(vcpu, - val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED); - break; - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: - kvm_arm_set_vcpu_workaround_2_flag(vcpu, true); - break; - } - - return 0; - default: - return -ENOENT; - } - - return -EINVAL; -} diff --git a/virt/kvm/arm/pvtime.c b/virt/kvm/arm/pvtime.c deleted file mode 100644 index 1e0f4c284888..000000000000 --- a/virt/kvm/arm/pvtime.c +++ /dev/null @@ -1,131 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (C) 2019 Arm Ltd. 
- -#include -#include - -#include -#include - -#include - -void kvm_update_stolen_time(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - u64 steal; - __le64 steal_le; - u64 offset; - int idx; - u64 base = vcpu->arch.steal.base; - - if (base == GPA_INVALID) - return; - - /* Let's do the local bookkeeping */ - steal = vcpu->arch.steal.steal; - steal += current->sched_info.run_delay - vcpu->arch.steal.last_steal; - vcpu->arch.steal.last_steal = current->sched_info.run_delay; - vcpu->arch.steal.steal = steal; - - steal_le = cpu_to_le64(steal); - idx = srcu_read_lock(&kvm->srcu); - offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time); - kvm_put_guest(kvm, base + offset, steal_le, u64); - srcu_read_unlock(&kvm->srcu, idx); -} - -long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu) -{ - u32 feature = smccc_get_arg1(vcpu); - long val = SMCCC_RET_NOT_SUPPORTED; - - switch (feature) { - case ARM_SMCCC_HV_PV_TIME_FEATURES: - case ARM_SMCCC_HV_PV_TIME_ST: - val = SMCCC_RET_SUCCESS; - break; - } - - return val; -} - -gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) -{ - struct pvclock_vcpu_stolen_time init_values = {}; - struct kvm *kvm = vcpu->kvm; - u64 base = vcpu->arch.steal.base; - int idx; - - if (base == GPA_INVALID) - return base; - - /* - * Start counting stolen time from the time the guest requests - * the feature enabled. 
- */ - vcpu->arch.steal.steal = 0; - vcpu->arch.steal.last_steal = current->sched_info.run_delay; - - idx = srcu_read_lock(&kvm->srcu); - kvm_write_guest(kvm, base, &init_values, sizeof(init_values)); - srcu_read_unlock(&kvm->srcu, idx); - - return base; -} - -int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - u64 __user *user = (u64 __user *)attr->addr; - struct kvm *kvm = vcpu->kvm; - u64 ipa; - int ret = 0; - int idx; - - if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA) - return -ENXIO; - - if (get_user(ipa, user)) - return -EFAULT; - if (!IS_ALIGNED(ipa, 64)) - return -EINVAL; - if (vcpu->arch.steal.base != GPA_INVALID) - return -EEXIST; - - /* Check the address is in a valid memslot */ - idx = srcu_read_lock(&kvm->srcu); - if (kvm_is_error_hva(gfn_to_hva(kvm, ipa >> PAGE_SHIFT))) - ret = -EINVAL; - srcu_read_unlock(&kvm->srcu, idx); - - if (!ret) - vcpu->arch.steal.base = ipa; - - return ret; -} - -int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - u64 __user *user = (u64 __user *)attr->addr; - u64 ipa; - - if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA) - return -ENXIO; - - ipa = vcpu->arch.steal.base; - - if (put_user(ipa, user)) - return -EFAULT; - return 0; -} - -int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - switch (attr->attr) { - case KVM_ARM_VCPU_PVTIME_IPA: - return 0; - } - return -ENXIO; -} diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h deleted file mode 100644 index cc94ccc68821..000000000000 --- a/virt/kvm/arm/trace.h +++ /dev/null @@ -1,379 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_KVM_H - -#include -#include -#include - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kvm - -/* - * Tracepoints for entry/exit to guest - */ -TRACE_EVENT(kvm_entry, - TP_PROTO(unsigned long vcpu_pc), - TP_ARGS(vcpu_pc), - - TP_STRUCT__entry( - __field( unsigned long, vcpu_pc 
) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - ), - - TP_printk("PC: 0x%08lx", __entry->vcpu_pc) -); - -TRACE_EVENT(kvm_exit, - TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc), - TP_ARGS(ret, esr_ec, vcpu_pc), - - TP_STRUCT__entry( - __field( int, ret ) - __field( unsigned int, esr_ec ) - __field( unsigned long, vcpu_pc ) - ), - - TP_fast_assign( - __entry->ret = ARM_EXCEPTION_CODE(ret); - __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0; - __entry->vcpu_pc = vcpu_pc; - ), - - TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx", - __print_symbolic(__entry->ret, kvm_arm_exception_type), - __entry->esr_ec, - __print_symbolic(__entry->esr_ec, kvm_arm_exception_class), - __entry->vcpu_pc) -); - -TRACE_EVENT(kvm_guest_fault, - TP_PROTO(unsigned long vcpu_pc, unsigned long hsr, - unsigned long hxfar, - unsigned long long ipa), - TP_ARGS(vcpu_pc, hsr, hxfar, ipa), - - TP_STRUCT__entry( - __field( unsigned long, vcpu_pc ) - __field( unsigned long, hsr ) - __field( unsigned long, hxfar ) - __field( unsigned long long, ipa ) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - __entry->hsr = hsr; - __entry->hxfar = hxfar; - __entry->ipa = ipa; - ), - - TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx", - __entry->ipa, __entry->hsr, - __entry->hxfar, __entry->vcpu_pc) -); - -TRACE_EVENT(kvm_access_fault, - TP_PROTO(unsigned long ipa), - TP_ARGS(ipa), - - TP_STRUCT__entry( - __field( unsigned long, ipa ) - ), - - TP_fast_assign( - __entry->ipa = ipa; - ), - - TP_printk("IPA: %lx", __entry->ipa) -); - -TRACE_EVENT(kvm_irq_line, - TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level), - TP_ARGS(type, vcpu_idx, irq_num, level), - - TP_STRUCT__entry( - __field( unsigned int, type ) - __field( int, vcpu_idx ) - __field( int, irq_num ) - __field( int, level ) - ), - - TP_fast_assign( - __entry->type = type; - __entry->vcpu_idx = vcpu_idx; - __entry->irq_num = irq_num; - __entry->level = level; - ), - - TP_printk("Inject %s 
interrupt (%d), vcpu->idx: %d, num: %d, level: %d", - (__entry->type == KVM_ARM_IRQ_TYPE_CPU) ? "CPU" : - (__entry->type == KVM_ARM_IRQ_TYPE_PPI) ? "VGIC PPI" : - (__entry->type == KVM_ARM_IRQ_TYPE_SPI) ? "VGIC SPI" : "UNKNOWN", - __entry->type, __entry->vcpu_idx, __entry->irq_num, __entry->level) -); - -TRACE_EVENT(kvm_mmio_emulate, - TP_PROTO(unsigned long vcpu_pc, unsigned long instr, - unsigned long cpsr), - TP_ARGS(vcpu_pc, instr, cpsr), - - TP_STRUCT__entry( - __field( unsigned long, vcpu_pc ) - __field( unsigned long, instr ) - __field( unsigned long, cpsr ) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - __entry->instr = instr; - __entry->cpsr = cpsr; - ), - - TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)", - __entry->vcpu_pc, __entry->instr, __entry->cpsr) -); - -TRACE_EVENT(kvm_unmap_hva_range, - TP_PROTO(unsigned long start, unsigned long end), - TP_ARGS(start, end), - - TP_STRUCT__entry( - __field( unsigned long, start ) - __field( unsigned long, end ) - ), - - TP_fast_assign( - __entry->start = start; - __entry->end = end; - ), - - TP_printk("mmu notifier unmap range: %#08lx -- %#08lx", - __entry->start, __entry->end) -); - -TRACE_EVENT(kvm_set_spte_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva) -); - -TRACE_EVENT(kvm_age_hva, - TP_PROTO(unsigned long start, unsigned long end), - TP_ARGS(start, end), - - TP_STRUCT__entry( - __field( unsigned long, start ) - __field( unsigned long, end ) - ), - - TP_fast_assign( - __entry->start = start; - __entry->end = end; - ), - - TP_printk("mmu notifier age hva: %#08lx -- %#08lx", - __entry->start, __entry->end) -); - -TRACE_EVENT(kvm_test_age_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("mmu 
notifier test age hva: %#08lx", __entry->hva) -); - -TRACE_EVENT(kvm_set_way_flush, - TP_PROTO(unsigned long vcpu_pc, bool cache), - TP_ARGS(vcpu_pc, cache), - - TP_STRUCT__entry( - __field( unsigned long, vcpu_pc ) - __field( bool, cache ) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - __entry->cache = cache; - ), - - TP_printk("S/W flush at 0x%016lx (cache %s)", - __entry->vcpu_pc, __entry->cache ? "on" : "off") -); - -TRACE_EVENT(kvm_toggle_cache, - TP_PROTO(unsigned long vcpu_pc, bool was, bool now), - TP_ARGS(vcpu_pc, was, now), - - TP_STRUCT__entry( - __field( unsigned long, vcpu_pc ) - __field( bool, was ) - __field( bool, now ) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - __entry->was = was; - __entry->now = now; - ), - - TP_printk("VM op at 0x%016lx (cache was %s, now %s)", - __entry->vcpu_pc, __entry->was ? "on" : "off", - __entry->now ? "on" : "off") -); - -/* - * Tracepoints for arch_timer - */ -TRACE_EVENT(kvm_timer_update_irq, - TP_PROTO(unsigned long vcpu_id, __u32 irq, int level), - TP_ARGS(vcpu_id, irq, level), - - TP_STRUCT__entry( - __field( unsigned long, vcpu_id ) - __field( __u32, irq ) - __field( int, level ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->irq = irq; - __entry->level = level; - ), - - TP_printk("VCPU: %ld, IRQ %d, level %d", - __entry->vcpu_id, __entry->irq, __entry->level) -); - -TRACE_EVENT(kvm_get_timer_map, - TP_PROTO(unsigned long vcpu_id, struct timer_map *map), - TP_ARGS(vcpu_id, map), - - TP_STRUCT__entry( - __field( unsigned long, vcpu_id ) - __field( int, direct_vtimer ) - __field( int, direct_ptimer ) - __field( int, emul_ptimer ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->direct_vtimer = arch_timer_ctx_index(map->direct_vtimer); - __entry->direct_ptimer = - (map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1; - __entry->emul_ptimer = - (map->emul_ptimer) ? 
arch_timer_ctx_index(map->emul_ptimer) : -1; - ), - - TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d", - __entry->vcpu_id, - __entry->direct_vtimer, - __entry->direct_ptimer, - __entry->emul_ptimer) -); - -TRACE_EVENT(kvm_timer_save_state, - TP_PROTO(struct arch_timer_context *ctx), - TP_ARGS(ctx), - - TP_STRUCT__entry( - __field( unsigned long, ctl ) - __field( unsigned long long, cval ) - __field( int, timer_idx ) - ), - - TP_fast_assign( - __entry->ctl = ctx->cnt_ctl; - __entry->cval = ctx->cnt_cval; - __entry->timer_idx = arch_timer_ctx_index(ctx); - ), - - TP_printk(" CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d", - __entry->ctl, - __entry->cval, - __entry->timer_idx) -); - -TRACE_EVENT(kvm_timer_restore_state, - TP_PROTO(struct arch_timer_context *ctx), - TP_ARGS(ctx), - - TP_STRUCT__entry( - __field( unsigned long, ctl ) - __field( unsigned long long, cval ) - __field( int, timer_idx ) - ), - - TP_fast_assign( - __entry->ctl = ctx->cnt_ctl; - __entry->cval = ctx->cnt_cval; - __entry->timer_idx = arch_timer_ctx_index(ctx); - ), - - TP_printk("CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d", - __entry->ctl, - __entry->cval, - __entry->timer_idx) -); - -TRACE_EVENT(kvm_timer_hrtimer_expire, - TP_PROTO(struct arch_timer_context *ctx), - TP_ARGS(ctx), - - TP_STRUCT__entry( - __field( int, timer_idx ) - ), - - TP_fast_assign( - __entry->timer_idx = arch_timer_ctx_index(ctx); - ), - - TP_printk("arch_timer_ctx_index: %d", __entry->timer_idx) -); - -TRACE_EVENT(kvm_timer_emulate, - TP_PROTO(struct arch_timer_context *ctx, bool should_fire), - TP_ARGS(ctx, should_fire), - - TP_STRUCT__entry( - __field( int, timer_idx ) - __field( bool, should_fire ) - ), - - TP_fast_assign( - __entry->timer_idx = arch_timer_ctx_index(ctx); - __entry->should_fire = should_fire; - ), - - TP_printk("arch_timer_ctx_index: %d (should_fire: %d)", - __entry->timer_idx, __entry->should_fire) -); - -#endif /* _TRACE_KVM_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH 
../../virt/kvm/arm -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace - -/* This part must be outside protection */ -#include diff --git a/virt/kvm/arm/vgic/trace.h b/virt/kvm/arm/vgic/trace.h deleted file mode 100644 index 4fd4f6db181b..000000000000 --- a/virt/kvm/arm/vgic/trace.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#if !defined(_TRACE_VGIC_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_VGIC_H - -#include - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kvm - -TRACE_EVENT(vgic_update_irq_pending, - TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level), - TP_ARGS(vcpu_id, irq, level), - - TP_STRUCT__entry( - __field( unsigned long, vcpu_id ) - __field( __u32, irq ) - __field( bool, level ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->irq = irq; - __entry->level = level; - ), - - TP_printk("VCPU: %ld, IRQ %d, level: %d", - __entry->vcpu_id, __entry->irq, __entry->level) -); - -#endif /* _TRACE_VGIC_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../virt/kvm/arm/vgic -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace - -/* This part must be outside protection */ -#include diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c deleted file mode 100644 index b13a9e3f99dd..000000000000 --- a/virt/kvm/arm/vgic/vgic-debug.c +++ /dev/null @@ -1,300 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2016 Linaro - * Author: Christoffer Dall - */ - -#include -#include -#include -#include -#include -#include -#include -#include "vgic.h" - -/* - * Structure to control looping through the entire vgic state. We start at - * zero for each field and move upwards. So, if dist_id is 0 we print the - * distributor info. When dist_id is 1, we have already printed it and move - * on. - * - * When vcpu_id < nr_cpus we print the vcpu info until vcpu_id == nr_cpus and - * so on. 
- */ -struct vgic_state_iter { - int nr_cpus; - int nr_spis; - int nr_lpis; - int dist_id; - int vcpu_id; - int intid; - int lpi_idx; - u32 *lpi_array; -}; - -static void iter_next(struct vgic_state_iter *iter) -{ - if (iter->dist_id == 0) { - iter->dist_id++; - return; - } - - iter->intid++; - if (iter->intid == VGIC_NR_PRIVATE_IRQS && - ++iter->vcpu_id < iter->nr_cpus) - iter->intid = 0; - - if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS)) { - if (iter->lpi_idx < iter->nr_lpis) - iter->intid = iter->lpi_array[iter->lpi_idx]; - iter->lpi_idx++; - } -} - -static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter, - loff_t pos) -{ - int nr_cpus = atomic_read(&kvm->online_vcpus); - - memset(iter, 0, sizeof(*iter)); - - iter->nr_cpus = nr_cpus; - iter->nr_spis = kvm->arch.vgic.nr_spis; - if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - iter->nr_lpis = vgic_copy_lpi_list(kvm, NULL, &iter->lpi_array); - if (iter->nr_lpis < 0) - iter->nr_lpis = 0; - } - - /* Fast forward to the right position if needed */ - while (pos--) - iter_next(iter); -} - -static bool end_of_vgic(struct vgic_state_iter *iter) -{ - return iter->dist_id > 0 && - iter->vcpu_id == iter->nr_cpus && - iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) && - iter->lpi_idx > iter->nr_lpis; -} - -static void *vgic_debug_start(struct seq_file *s, loff_t *pos) -{ - struct kvm *kvm = (struct kvm *)s->private; - struct vgic_state_iter *iter; - - mutex_lock(&kvm->lock); - iter = kvm->arch.vgic.iter; - if (iter) { - iter = ERR_PTR(-EBUSY); - goto out; - } - - iter = kmalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) { - iter = ERR_PTR(-ENOMEM); - goto out; - } - - iter_init(kvm, iter, *pos); - kvm->arch.vgic.iter = iter; - - if (end_of_vgic(iter)) - iter = NULL; -out: - mutex_unlock(&kvm->lock); - return iter; -} - -static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct kvm *kvm = (struct kvm *)s->private; - struct vgic_state_iter *iter = 
kvm->arch.vgic.iter; - - ++*pos; - iter_next(iter); - if (end_of_vgic(iter)) - iter = NULL; - return iter; -} - -static void vgic_debug_stop(struct seq_file *s, void *v) -{ - struct kvm *kvm = (struct kvm *)s->private; - struct vgic_state_iter *iter; - - /* - * If the seq file wasn't properly opened, there's nothing to clearn - * up. - */ - if (IS_ERR(v)) - return; - - mutex_lock(&kvm->lock); - iter = kvm->arch.vgic.iter; - kfree(iter->lpi_array); - kfree(iter); - kvm->arch.vgic.iter = NULL; - mutex_unlock(&kvm->lock); -} - -static void print_dist_state(struct seq_file *s, struct vgic_dist *dist) -{ - bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3; - - seq_printf(s, "Distributor\n"); - seq_printf(s, "===========\n"); - seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2"); - seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis); - if (v3) - seq_printf(s, "nr_lpis:\t%d\n", dist->lpi_list_count); - seq_printf(s, "enabled:\t%d\n", dist->enabled); - seq_printf(s, "\n"); - - seq_printf(s, "P=pending_latch, L=line_level, A=active\n"); - seq_printf(s, "E=enabled, H=hw, C=config (level=1, edge=0)\n"); - seq_printf(s, "G=group\n"); -} - -static void print_header(struct seq_file *s, struct vgic_irq *irq, - struct kvm_vcpu *vcpu) -{ - int id = 0; - char *hdr = "SPI "; - - if (vcpu) { - hdr = "VCPU"; - id = vcpu->vcpu_id; - } - - seq_printf(s, "\n"); - seq_printf(s, "%s%2d TYP ID TGT_ID PLAEHCG HWID TARGET SRC PRI VCPU_ID\n", hdr, id); - seq_printf(s, "----------------------------------------------------------------\n"); -} - -static void print_irq_state(struct seq_file *s, struct vgic_irq *irq, - struct kvm_vcpu *vcpu) -{ - char *type; - bool pending; - - if (irq->intid < VGIC_NR_SGIS) - type = "SGI"; - else if (irq->intid < VGIC_NR_PRIVATE_IRQS) - type = "PPI"; - else if (irq->intid < VGIC_MAX_SPI) - type = "SPI"; - else - type = "LPI"; - - if (irq->intid ==0 || irq->intid == VGIC_NR_PRIVATE_IRQS) - print_header(s, irq, vcpu); - - pending = irq->pending_latch; - if 
(irq->hw && vgic_irq_is_sgi(irq->intid)) { - int err; - - err = irq_get_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - &pending); - WARN_ON_ONCE(err); - } - - seq_printf(s, " %s %4d " - " %2d " - "%d%d%d%d%d%d%d " - "%8d " - "%8x " - " %2x " - "%3d " - " %2d " - "\n", - type, irq->intid, - (irq->target_vcpu) ? irq->target_vcpu->vcpu_id : -1, - pending, - irq->line_level, - irq->active, - irq->enabled, - irq->hw, - irq->config == VGIC_CONFIG_LEVEL, - irq->group, - irq->hwintid, - irq->mpidr, - irq->source, - irq->priority, - (irq->vcpu) ? irq->vcpu->vcpu_id : -1); -} - -static int vgic_debug_show(struct seq_file *s, void *v) -{ - struct kvm *kvm = (struct kvm *)s->private; - struct vgic_state_iter *iter = (struct vgic_state_iter *)v; - struct vgic_irq *irq; - struct kvm_vcpu *vcpu = NULL; - unsigned long flags; - - if (iter->dist_id == 0) { - print_dist_state(s, &kvm->arch.vgic); - return 0; - } - - if (!kvm->arch.vgic.initialized) - return 0; - - if (iter->vcpu_id < iter->nr_cpus) - vcpu = kvm_get_vcpu(kvm, iter->vcpu_id); - - irq = vgic_get_irq(kvm, vcpu, iter->intid); - if (!irq) { - seq_printf(s, " LPI %4d freed\n", iter->intid); - return 0; - } - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - print_irq_state(s, irq, vcpu); - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - - vgic_put_irq(kvm, irq); - return 0; -} - -static const struct seq_operations vgic_debug_seq_ops = { - .start = vgic_debug_start, - .next = vgic_debug_next, - .stop = vgic_debug_stop, - .show = vgic_debug_show -}; - -static int debug_open(struct inode *inode, struct file *file) -{ - int ret; - ret = seq_open(file, &vgic_debug_seq_ops); - if (!ret) { - struct seq_file *seq; - /* seq_open will have modified file->private_data */ - seq = file->private_data; - seq->private = inode->i_private; - } - - return ret; -}; - -static const struct file_operations vgic_debug_fops = { - .owner = THIS_MODULE, - .open = debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = 
seq_release -}; - -void vgic_debug_init(struct kvm *kvm) -{ - debugfs_create_file("vgic-state", 0444, kvm->debugfs_dentry, kvm, - &vgic_debug_fops); -} - -void vgic_debug_destroy(struct kvm *kvm) -{ -} diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c deleted file mode 100644 index 32e32d67a127..000000000000 --- a/virt/kvm/arm/vgic/vgic-init.c +++ /dev/null @@ -1,556 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2015, 2016 ARM Ltd. - */ - -#include -#include -#include -#include -#include -#include -#include -#include "vgic.h" - -/* - * Initialization rules: there are multiple stages to the vgic - * initialization, both for the distributor and the CPU interfaces. The basic - * idea is that even though the VGIC is not functional or not requested from - * user space, the critical path of the run loop can still call VGIC functions - * that just won't do anything, without them having to check additional - * initialization flags to ensure they don't look at uninitialized data - * structures. - * - * Distributor: - * - * - kvm_vgic_early_init(): initialization of static data that doesn't - * depend on any sizing information or emulation type. No allocation - * is allowed there. - * - * - vgic_init(): allocation and initialization of the generic data - * structures that depend on sizing information (number of CPUs, - * number of interrupts). Also initializes the vcpu specific data - * structures. Can be executed lazily for GICv2. - * - * CPU Interface: - * - * - kvm_vgic_vcpu_init(): initialization of static data that - * doesn't depend on any sizing information or emulation type. No - * allocation is allowed there. - */ - -/* EARLY INIT */ - -/** - * kvm_vgic_early_init() - Initialize static VGIC VCPU data structures - * @kvm: The VM whose VGIC districutor should be initialized - * - * Only do initialization of static structures that don't require any - * allocation or sizing information from userspace. 
vgic_init() called - * kvm_vgic_dist_init() which takes care of the rest. - */ -void kvm_vgic_early_init(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - - INIT_LIST_HEAD(&dist->lpi_list_head); - INIT_LIST_HEAD(&dist->lpi_translation_cache); - raw_spin_lock_init(&dist->lpi_list_lock); -} - -/* CREATION */ - -/** - * kvm_vgic_create: triggered by the instantiation of the VGIC device by - * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only) - * or through the generic KVM_CREATE_DEVICE API ioctl. - * irqchip_in_kernel() tells you if this function succeeded or not. - * @kvm: kvm struct pointer - * @type: KVM_DEV_TYPE_ARM_VGIC_V[23] - */ -int kvm_vgic_create(struct kvm *kvm, u32 type) -{ - int i, ret; - struct kvm_vcpu *vcpu; - - if (irqchip_in_kernel(kvm)) - return -EEXIST; - - /* - * This function is also called by the KVM_CREATE_IRQCHIP handler, - * which had no chance yet to check the availability of the GICv2 - * emulation. So check this here again. KVM_CREATE_DEVICE does - * the proper checks already. 
- */ - if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && - !kvm_vgic_global_state.can_emulate_gicv2) - return -ENODEV; - - ret = -EBUSY; - if (!lock_all_vcpus(kvm)) - return ret; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu->arch.has_run_once) - goto out_unlock; - } - ret = 0; - - if (type == KVM_DEV_TYPE_ARM_VGIC_V2) - kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS; - else - kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS; - - if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) { - ret = -E2BIG; - goto out_unlock; - } - - kvm->arch.vgic.in_kernel = true; - kvm->arch.vgic.vgic_model = type; - - kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; - - if (type == KVM_DEV_TYPE_ARM_VGIC_V2) - kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; - else - INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions); - -out_unlock: - unlock_all_vcpus(kvm); - return ret; -} - -/* INIT/DESTROY */ - -/** - * kvm_vgic_dist_init: initialize the dist data structures - * @kvm: kvm struct pointer - * @nr_spis: number of spis, frozen by caller - */ -static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); - int i; - - dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL); - if (!dist->spis) - return -ENOMEM; - - /* - * In the following code we do not take the irq struct lock since - * no other action on irq structs can happen while the VGIC is - * not initialized yet: - * If someone wants to inject an interrupt or does a MMIO access, we - * require prior initialization in case of a virtual GICv3 or trigger - * initialization when using a virtual GICv2. 
- */ - for (i = 0; i < nr_spis; i++) { - struct vgic_irq *irq = &dist->spis[i]; - - irq->intid = i + VGIC_NR_PRIVATE_IRQS; - INIT_LIST_HEAD(&irq->ap_list); - raw_spin_lock_init(&irq->irq_lock); - irq->vcpu = NULL; - irq->target_vcpu = vcpu0; - kref_init(&irq->refcount); - switch (dist->vgic_model) { - case KVM_DEV_TYPE_ARM_VGIC_V2: - irq->targets = 0; - irq->group = 0; - break; - case KVM_DEV_TYPE_ARM_VGIC_V3: - irq->mpidr = 0; - irq->group = 1; - break; - default: - kfree(dist->spis); - dist->spis = NULL; - return -EINVAL; - } - } - return 0; -} - -/** - * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data - * structures and register VCPU-specific KVM iodevs - * - * @vcpu: pointer to the VCPU being created and initialized - * - * Only do initialization, but do not actually enable the - * VGIC CPU interface - */ -int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - int ret = 0; - int i; - - vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; - - INIT_LIST_HEAD(&vgic_cpu->ap_list_head); - raw_spin_lock_init(&vgic_cpu->ap_list_lock); - atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0); - - /* - * Enable and configure all SGIs to be edge-triggered and - * configure all PPIs as level-triggered. - */ - for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; - - INIT_LIST_HEAD(&irq->ap_list); - raw_spin_lock_init(&irq->irq_lock); - irq->intid = i; - irq->vcpu = NULL; - irq->target_vcpu = vcpu; - kref_init(&irq->refcount); - if (vgic_irq_is_sgi(i)) { - /* SGIs */ - irq->enabled = 1; - irq->config = VGIC_CONFIG_EDGE; - } else { - /* PPIs */ - irq->config = VGIC_CONFIG_LEVEL; - } - } - - if (!irqchip_in_kernel(vcpu->kvm)) - return 0; - - /* - * If we are creating a VCPU with a GICv3 we must also register the - * KVM io device for the redistributor that belongs to this VCPU. 
- */ - if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - mutex_lock(&vcpu->kvm->lock); - ret = vgic_register_redist_iodev(vcpu); - mutex_unlock(&vcpu->kvm->lock); - } - return ret; -} - -static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_enable(vcpu); - else - vgic_v3_enable(vcpu); -} - -/* - * vgic_init: allocates and initializes dist and vcpu data structures - * depending on two dimensioning parameters: - * - the number of spis - * - the number of vcpus - * The function is generally called when nr_spis has been explicitly set - * by the guest through the KVM DEVICE API. If not nr_spis is set to 256. - * vgic_initialized() returns true when this function has succeeded. - * Must be called with kvm->lock held! - */ -int vgic_init(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int ret = 0, i, idx; - - if (vgic_initialized(kvm)) - return 0; - - /* Are we also in the middle of creating a VCPU? 
*/ - if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) - return -EBUSY; - - /* freeze the number of spis */ - if (!dist->nr_spis) - dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS; - - ret = kvm_vgic_dist_init(kvm, dist->nr_spis); - if (ret) - goto out; - - /* Initialize groups on CPUs created before the VGIC type was known */ - kvm_for_each_vcpu(idx, vcpu, kvm) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; - switch (dist->vgic_model) { - case KVM_DEV_TYPE_ARM_VGIC_V3: - irq->group = 1; - irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); - break; - case KVM_DEV_TYPE_ARM_VGIC_V2: - irq->group = 0; - irq->targets = 1U << idx; - break; - default: - ret = -EINVAL; - goto out; - } - } - } - - if (vgic_has_its(kvm)) - vgic_lpi_translation_cache_init(kvm); - - /* - * If we have GICv4.1 enabled, unconditionnaly request enable the - * v4 support so that we get HW-accelerated vSGIs. Otherwise, only - * enable it if we present a virtual ITS to the guest. 
- */ - if (vgic_supports_direct_msis(kvm)) { - ret = vgic_v4_init(kvm); - if (ret) - goto out; - } - - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vgic_vcpu_enable(vcpu); - - ret = kvm_vgic_setup_default_irq_routing(kvm); - if (ret) - goto out; - - vgic_debug_init(kvm); - - dist->implementation_rev = 2; - dist->initialized = true; - -out: - return ret; -} - -static void kvm_vgic_dist_destroy(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_redist_region *rdreg, *next; - - dist->ready = false; - dist->initialized = false; - - kfree(dist->spis); - dist->spis = NULL; - dist->nr_spis = 0; - - if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) { - list_del(&rdreg->list); - kfree(rdreg); - } - INIT_LIST_HEAD(&dist->rd_regions); - } - - if (vgic_has_its(kvm)) - vgic_lpi_translation_cache_destroy(kvm); - - if (vgic_supports_direct_msis(kvm)) - vgic_v4_teardown(kvm); -} - -void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - /* - * Retire all pending LPIs on this vcpu anyway as we're - * going to destroy it. - */ - vgic_flush_pending_lpis(vcpu); - - INIT_LIST_HEAD(&vgic_cpu->ap_list_head); -} - -/* To be called with kvm->lock held */ -static void __kvm_vgic_destroy(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int i; - - vgic_debug_destroy(kvm); - - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vgic_vcpu_destroy(vcpu); - - kvm_vgic_dist_destroy(kvm); -} - -void kvm_vgic_destroy(struct kvm *kvm) -{ - mutex_lock(&kvm->lock); - __kvm_vgic_destroy(kvm); - mutex_unlock(&kvm->lock); -} - -/** - * vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest - * is a GICv2. A GICv3 must be explicitly initialized by the guest using the - * KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group. 
- * @kvm: kvm struct pointer - */ -int vgic_lazy_init(struct kvm *kvm) -{ - int ret = 0; - - if (unlikely(!vgic_initialized(kvm))) { - /* - * We only provide the automatic initialization of the VGIC - * for the legacy case of a GICv2. Any other type must - * be explicitly initialized once setup with the respective - * KVM device call. - */ - if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) - return -EBUSY; - - mutex_lock(&kvm->lock); - ret = vgic_init(kvm); - mutex_unlock(&kvm->lock); - } - - return ret; -} - -/* RESOURCE MAPPING */ - -/** - * Map the MMIO regions depending on the VGIC model exposed to the guest - * called on the first VCPU run. - * Also map the virtual CPU interface into the VM. - * v2/v3 derivatives call vgic_init if not already done. - * vgic_ready() returns true if this function has succeeded. - * @kvm: kvm struct pointer - */ -int kvm_vgic_map_resources(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - int ret = 0; - - mutex_lock(&kvm->lock); - if (!irqchip_in_kernel(kvm)) - goto out; - - if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) - ret = vgic_v2_map_resources(kvm); - else - ret = vgic_v3_map_resources(kvm); - - if (ret) - __kvm_vgic_destroy(kvm); - -out: - mutex_unlock(&kvm->lock); - return ret; -} - -/* GENERIC PROBE */ - -static int vgic_init_cpu_starting(unsigned int cpu) -{ - enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0); - return 0; -} - - -static int vgic_init_cpu_dying(unsigned int cpu) -{ - disable_percpu_irq(kvm_vgic_global_state.maint_irq); - return 0; -} - -static irqreturn_t vgic_maintenance_handler(int irq, void *data) -{ - /* - * We cannot rely on the vgic maintenance interrupt to be - * delivered synchronously. This means we can only use it to - * exit the VM, and we perform the handling of EOIed - * interrupts on the exit path (see vgic_fold_lr_state). 
- */ - return IRQ_HANDLED; -} - -/** - * kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware - * - * For a specific CPU, initialize the GIC VE hardware. - */ -void kvm_vgic_init_cpu_hardware(void) -{ - BUG_ON(preemptible()); - - /* - * We want to make sure the list registers start out clear so that we - * only have the program the used registers. - */ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_init_lrs(); - else - kvm_call_hyp(__vgic_v3_init_lrs); -} - -/** - * kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable - * according to the host GIC model. Accordingly calls either - * vgic_v2/v3_probe which registers the KVM_DEVICE that can be - * instantiated by a guest later on . - */ -int kvm_vgic_hyp_init(void) -{ - const struct gic_kvm_info *gic_kvm_info; - int ret; - - gic_kvm_info = gic_get_kvm_info(); - if (!gic_kvm_info) - return -ENODEV; - - if (!gic_kvm_info->maint_irq) { - kvm_err("No vgic maintenance irq\n"); - return -ENXIO; - } - - switch (gic_kvm_info->type) { - case GIC_V2: - ret = vgic_v2_probe(gic_kvm_info); - break; - case GIC_V3: - ret = vgic_v3_probe(gic_kvm_info); - if (!ret) { - static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif); - kvm_info("GIC system register CPU interface enabled\n"); - } - break; - default: - ret = -ENODEV; - } - - if (ret) - return ret; - - kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq; - ret = request_percpu_irq(kvm_vgic_global_state.maint_irq, - vgic_maintenance_handler, - "vgic", kvm_get_running_vcpus()); - if (ret) { - kvm_err("Cannot register interrupt %d\n", - kvm_vgic_global_state.maint_irq); - return ret; - } - - ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING, - "kvm/arm/vgic:starting", - vgic_init_cpu_starting, vgic_init_cpu_dying); - if (ret) { - kvm_err("Cannot register vgic CPU notifier\n"); - goto out_free_irq; - } - - kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq); - return 0; - -out_free_irq: - 
free_percpu_irq(kvm_vgic_global_state.maint_irq, - kvm_get_running_vcpus()); - return ret; -} diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c b/virt/kvm/arm/vgic/vgic-irqfd.c deleted file mode 100644 index d8cdfea5cc96..000000000000 --- a/virt/kvm/arm/vgic/vgic-irqfd.c +++ /dev/null @@ -1,141 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2015, 2016 ARM Ltd. - */ - -#include -#include -#include -#include -#include "vgic.h" - -/** - * vgic_irqfd_set_irq: inject the IRQ corresponding to the - * irqchip routing entry - * - * This is the entry point for irqfd IRQ injection - */ -static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, - int level, bool line_status) -{ - unsigned int spi_id = e->irqchip.pin + VGIC_NR_PRIVATE_IRQS; - - if (!vgic_valid_spi(kvm, spi_id)) - return -EINVAL; - return kvm_vgic_inject_irq(kvm, 0, spi_id, level, NULL); -} - -/** - * kvm_set_routing_entry: populate a kvm routing entry - * from a user routing entry - * - * @kvm: the VM this entry is applied to - * @e: kvm kernel routing entry handle - * @ue: user api routing entry handle - * return 0 on success, -EINVAL on errors. 
- */ -int kvm_set_routing_entry(struct kvm *kvm, - struct kvm_kernel_irq_routing_entry *e, - const struct kvm_irq_routing_entry *ue) -{ - int r = -EINVAL; - - switch (ue->type) { - case KVM_IRQ_ROUTING_IRQCHIP: - e->set = vgic_irqfd_set_irq; - e->irqchip.irqchip = ue->u.irqchip.irqchip; - e->irqchip.pin = ue->u.irqchip.pin; - if ((e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) || - (e->irqchip.irqchip >= KVM_NR_IRQCHIPS)) - goto out; - break; - case KVM_IRQ_ROUTING_MSI: - e->set = kvm_set_msi; - e->msi.address_lo = ue->u.msi.address_lo; - e->msi.address_hi = ue->u.msi.address_hi; - e->msi.data = ue->u.msi.data; - e->msi.flags = ue->flags; - e->msi.devid = ue->u.msi.devid; - break; - default: - goto out; - } - r = 0; -out: - return r; -} - -static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e, - struct kvm_msi *msi) -{ - msi->address_lo = e->msi.address_lo; - msi->address_hi = e->msi.address_hi; - msi->data = e->msi.data; - msi->flags = e->msi.flags; - msi->devid = e->msi.devid; -} -/** - * kvm_set_msi: inject the MSI corresponding to the - * MSI routing entry - * - * This is the entry point for irqfd MSI injection - * and userspace MSI injection. - */ -int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, - int level, bool line_status) -{ - struct kvm_msi msi; - - if (!vgic_has_its(kvm)) - return -ENODEV; - - if (!level) - return -1; - - kvm_populate_msi(e, &msi); - return vgic_its_inject_msi(kvm, &msi); -} - -/** - * kvm_arch_set_irq_inatomic: fast-path for irqfd injection - * - * Currently only direct MSI injection is supported. 
- */ -int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level, - bool line_status) -{ - if (e->type == KVM_IRQ_ROUTING_MSI && vgic_has_its(kvm) && level) { - struct kvm_msi msi; - - kvm_populate_msi(e, &msi); - if (!vgic_its_inject_cached_translation(kvm, &msi)) - return 0; - } - - return -EWOULDBLOCK; -} - -int kvm_vgic_setup_default_irq_routing(struct kvm *kvm) -{ - struct kvm_irq_routing_entry *entries; - struct vgic_dist *dist = &kvm->arch.vgic; - u32 nr = dist->nr_spis; - int i, ret; - - entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL); - if (!entries) - return -ENOMEM; - - for (i = 0; i < nr; i++) { - entries[i].gsi = i; - entries[i].type = KVM_IRQ_ROUTING_IRQCHIP; - entries[i].u.irqchip.irqchip = 0; - entries[i].u.irqchip.pin = i; - } - ret = kvm_set_irq_routing(kvm, entries, nr, 0); - kfree(entries); - return ret; -} diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c deleted file mode 100644 index c012a52b19f5..000000000000 --- a/virt/kvm/arm/vgic/vgic-its.c +++ /dev/null @@ -1,2783 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * GICv3 ITS emulation - * - * Copyright (C) 2015,2016 ARM Ltd. - * Author: Andre Przywara - */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include "vgic.h" -#include "vgic-mmio.h" - -static int vgic_its_save_tables_v0(struct vgic_its *its); -static int vgic_its_restore_tables_v0(struct vgic_its *its); -static int vgic_its_commit_v0(struct vgic_its *its); -static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, - struct kvm_vcpu *filter_vcpu, bool needs_inv); - -/* - * Creates a new (reference to a) struct vgic_irq for a given LPI. - * If this LPI is already mapped on another ITS, we increase its refcount - * and return a pointer to the existing structure. - * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq. 
- * This function returns a pointer to the _unlocked_ structure. - */ -static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, - struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq; - unsigned long flags; - int ret; - - /* In this case there is no put, since we keep the reference. */ - if (irq) - return irq; - - irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL); - if (!irq) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&irq->lpi_list); - INIT_LIST_HEAD(&irq->ap_list); - raw_spin_lock_init(&irq->irq_lock); - - irq->config = VGIC_CONFIG_EDGE; - kref_init(&irq->refcount); - irq->intid = intid; - irq->target_vcpu = vcpu; - irq->group = 1; - - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - - /* - * There could be a race with another vgic_add_lpi(), so we need to - * check that we don't add a second list entry with the same LPI. - */ - list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) { - if (oldirq->intid != intid) - continue; - - /* Someone was faster with adding this LPI, lets use that. */ - kfree(irq); - irq = oldirq; - - /* - * This increases the refcount, the caller is expected to - * call vgic_put_irq() on the returned pointer once it's - * finished with the IRQ. - */ - vgic_get_irq_kref(irq); - - goto out_unlock; - } - - list_add_tail(&irq->lpi_list, &dist->lpi_list_head); - dist->lpi_list_count++; - -out_unlock: - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); - - /* - * We "cache" the configuration table entries in our struct vgic_irq's. - * However we only have those structs for mapped IRQs, so we read in - * the respective config data from memory here upon mapping the LPI. - * - * Should any of these fail, behave as if we couldn't create the LPI - * by dropping the refcount and returning the error. 
- */ - ret = update_lpi_config(kvm, irq, NULL, false); - if (ret) { - vgic_put_irq(kvm, irq); - return ERR_PTR(ret); - } - - ret = vgic_v3_lpi_sync_pending_status(kvm, irq); - if (ret) { - vgic_put_irq(kvm, irq); - return ERR_PTR(ret); - } - - return irq; -} - -struct its_device { - struct list_head dev_list; - - /* the head for the list of ITTEs */ - struct list_head itt_head; - u32 num_eventid_bits; - gpa_t itt_addr; - u32 device_id; -}; - -#define COLLECTION_NOT_MAPPED ((u32)~0) - -struct its_collection { - struct list_head coll_list; - - u32 collection_id; - u32 target_addr; -}; - -#define its_is_collection_mapped(coll) ((coll) && \ - ((coll)->target_addr != COLLECTION_NOT_MAPPED)) - -struct its_ite { - struct list_head ite_list; - - struct vgic_irq *irq; - struct its_collection *collection; - u32 event_id; -}; - -struct vgic_translation_cache_entry { - struct list_head entry; - phys_addr_t db; - u32 devid; - u32 eventid; - struct vgic_irq *irq; -}; - -/** - * struct vgic_its_abi - ITS abi ops and settings - * @cte_esz: collection table entry size - * @dte_esz: device table entry size - * @ite_esz: interrupt translation table entry size - * @save tables: save the ITS tables into guest RAM - * @restore_tables: restore the ITS internal structs from tables - * stored in guest RAM - * @commit: initialize the registers which expose the ABI settings, - * especially the entry sizes - */ -struct vgic_its_abi { - int cte_esz; - int dte_esz; - int ite_esz; - int (*save_tables)(struct vgic_its *its); - int (*restore_tables)(struct vgic_its *its); - int (*commit)(struct vgic_its *its); -}; - -#define ABI_0_ESZ 8 -#define ESZ_MAX ABI_0_ESZ - -static const struct vgic_its_abi its_table_abi_versions[] = { - [0] = { - .cte_esz = ABI_0_ESZ, - .dte_esz = ABI_0_ESZ, - .ite_esz = ABI_0_ESZ, - .save_tables = vgic_its_save_tables_v0, - .restore_tables = vgic_its_restore_tables_v0, - .commit = vgic_its_commit_v0, - }, -}; - -#define NR_ITS_ABIS ARRAY_SIZE(its_table_abi_versions) - 
-inline const struct vgic_its_abi *vgic_its_get_abi(struct vgic_its *its) -{ - return &its_table_abi_versions[its->abi_rev]; -} - -static int vgic_its_set_abi(struct vgic_its *its, u32 rev) -{ - const struct vgic_its_abi *abi; - - its->abi_rev = rev; - abi = vgic_its_get_abi(its); - return abi->commit(its); -} - -/* - * Find and returns a device in the device table for an ITS. - * Must be called with the its_lock mutex held. - */ -static struct its_device *find_its_device(struct vgic_its *its, u32 device_id) -{ - struct its_device *device; - - list_for_each_entry(device, &its->device_list, dev_list) - if (device_id == device->device_id) - return device; - - return NULL; -} - -/* - * Find and returns an interrupt translation table entry (ITTE) for a given - * Device ID/Event ID pair on an ITS. - * Must be called with the its_lock mutex held. - */ -static struct its_ite *find_ite(struct vgic_its *its, u32 device_id, - u32 event_id) -{ - struct its_device *device; - struct its_ite *ite; - - device = find_its_device(its, device_id); - if (device == NULL) - return NULL; - - list_for_each_entry(ite, &device->itt_head, ite_list) - if (ite->event_id == event_id) - return ite; - - return NULL; -} - -/* To be used as an iterator this macro misses the enclosing parentheses */ -#define for_each_lpi_its(dev, ite, its) \ - list_for_each_entry(dev, &(its)->device_list, dev_list) \ - list_for_each_entry(ite, &(dev)->itt_head, ite_list) - -#define GIC_LPI_OFFSET 8192 - -#define VITS_TYPER_IDBITS 16 -#define VITS_TYPER_DEVBITS 16 -#define VITS_DTE_MAX_DEVID_OFFSET (BIT(14) - 1) -#define VITS_ITE_MAX_EVENTID_OFFSET (BIT(16) - 1) - -/* - * Finds and returns a collection in the ITS collection table. - * Must be called with the its_lock mutex held. 
- */ -static struct its_collection *find_collection(struct vgic_its *its, int coll_id) -{ - struct its_collection *collection; - - list_for_each_entry(collection, &its->collection_list, coll_list) { - if (coll_id == collection->collection_id) - return collection; - } - - return NULL; -} - -#define LPI_PROP_ENABLE_BIT(p) ((p) & LPI_PROP_ENABLED) -#define LPI_PROP_PRIORITY(p) ((p) & 0xfc) - -/* - * Reads the configuration data for a given LPI from guest memory and - * updates the fields in struct vgic_irq. - * If filter_vcpu is not NULL, applies only if the IRQ is targeting this - * VCPU. Unconditionally applies if filter_vcpu is NULL. - */ -static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, - struct kvm_vcpu *filter_vcpu, bool needs_inv) -{ - u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser); - u8 prop; - int ret; - unsigned long flags; - - ret = kvm_read_guest_lock(kvm, propbase + irq->intid - GIC_LPI_OFFSET, - &prop, 1); - - if (ret) - return ret; - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - - if (!filter_vcpu || filter_vcpu == irq->target_vcpu) { - irq->priority = LPI_PROP_PRIORITY(prop); - irq->enabled = LPI_PROP_ENABLE_BIT(prop); - - if (!irq->hw) { - vgic_queue_irq_unlock(kvm, irq, flags); - return 0; - } - } - - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - - if (irq->hw) - return its_prop_update_vlpi(irq->host_irq, prop, needs_inv); - - return 0; -} - -/* - * Create a snapshot of the current LPIs targeting @vcpu, so that we can - * enumerate those LPIs without holding any lock. - * Returns their number and puts the kmalloc'ed array into intid_ptr. - */ -int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_irq *irq; - unsigned long flags; - u32 *intids; - int irq_count, i = 0; - - /* - * There is an obvious race between allocating the array and LPIs - * being mapped/unmapped. 
If we ended up here as a result of a - * command, we're safe (locks are held, preventing another - * command). If coming from another path (such as enabling LPIs), - * we must be careful not to overrun the array. - */ - irq_count = READ_ONCE(dist->lpi_list_count); - intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL); - if (!intids) - return -ENOMEM; - - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { - if (i == irq_count) - break; - /* We don't need to "get" the IRQ, as we hold the list lock. */ - if (vcpu && irq->target_vcpu != vcpu) - continue; - intids[i++] = irq->intid; - } - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); - - *intid_ptr = intids; - return i; -} - -static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu) -{ - int ret = 0; - unsigned long flags; - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - irq->target_vcpu = vcpu; - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - - if (irq->hw) { - struct its_vlpi_map map; - - ret = its_get_vlpi(irq->host_irq, &map); - if (ret) - return ret; - - if (map.vpe) - atomic_dec(&map.vpe->vlpi_count); - map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; - atomic_inc(&map.vpe->vlpi_count); - - ret = its_map_vlpi(irq->host_irq, &map); - } - - return ret; -} - -/* - * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI - * is targeting) to the VGIC's view, which deals with target VCPUs. - * Needs to be called whenever either the collection for a LPIs has - * changed or the collection itself got retargeted. - */ -static void update_affinity_ite(struct kvm *kvm, struct its_ite *ite) -{ - struct kvm_vcpu *vcpu; - - if (!its_is_collection_mapped(ite->collection)) - return; - - vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); - update_affinity(ite->irq, vcpu); -} - -/* - * Updates the target VCPU for every LPI targeting this collection. - * Must be called with the its_lock mutex held. 
- */ -static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its, - struct its_collection *coll) -{ - struct its_device *device; - struct its_ite *ite; - - for_each_lpi_its(device, ite, its) { - if (!ite->collection || coll != ite->collection) - continue; - - update_affinity_ite(kvm, ite); - } -} - -static u32 max_lpis_propbaser(u64 propbaser) -{ - int nr_idbits = (propbaser & 0x1f) + 1; - - return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS); -} - -/* - * Sync the pending table pending bit of LPIs targeting @vcpu - * with our own data structures. This relies on the LPI being - * mapped before. - */ -static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) -{ - gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); - struct vgic_irq *irq; - int last_byte_offset = -1; - int ret = 0; - u32 *intids; - int nr_irqs, i; - unsigned long flags; - u8 pendmask; - - nr_irqs = vgic_copy_lpi_list(vcpu->kvm, vcpu, &intids); - if (nr_irqs < 0) - return nr_irqs; - - for (i = 0; i < nr_irqs; i++) { - int byte_offset, bit_nr; - - byte_offset = intids[i] / BITS_PER_BYTE; - bit_nr = intids[i] % BITS_PER_BYTE; - - /* - * For contiguously allocated LPIs chances are we just read - * this very same byte in the last iteration. Reuse that. 
- */ - if (byte_offset != last_byte_offset) { - ret = kvm_read_guest_lock(vcpu->kvm, - pendbase + byte_offset, - &pendmask, 1); - if (ret) { - kfree(intids); - return ret; - } - last_byte_offset = byte_offset; - } - - irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]); - raw_spin_lock_irqsave(&irq->irq_lock, flags); - irq->pending_latch = pendmask & (1U << bit_nr); - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - vgic_put_irq(vcpu->kvm, irq); - } - - kfree(intids); - - return ret; -} - -static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - u64 reg = GITS_TYPER_PLPIS; - - /* - * We use linear CPU numbers for redistributor addressing, - * so GITS_TYPER.PTA is 0. - * Also we force all PROPBASER registers to be the same, so - * CommonLPIAff is 0 as well. - * To avoid memory waste in the guest, we keep the number of IDBits and - * DevBits low - as least for the time being. 
- */ - reg |= GIC_ENCODE_SZ(VITS_TYPER_DEVBITS, 5) << GITS_TYPER_DEVBITS_SHIFT; - reg |= GIC_ENCODE_SZ(VITS_TYPER_IDBITS, 5) << GITS_TYPER_IDBITS_SHIFT; - reg |= GIC_ENCODE_SZ(abi->ite_esz, 4) << GITS_TYPER_ITT_ENTRY_SIZE_SHIFT; - - return extract_bytes(reg, addr & 7, len); -} - -static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - u32 val; - - val = (its->abi_rev << GITS_IIDR_REV_SHIFT) & GITS_IIDR_REV_MASK; - val |= (PRODUCT_ID_KVM << GITS_IIDR_PRODUCTID_SHIFT) | IMPLEMENTER_ARM; - return val; -} - -static int vgic_mmio_uaccess_write_its_iidr(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 rev = GITS_IIDR_REV(val); - - if (rev >= NR_ITS_ABIS) - return -EINVAL; - return vgic_its_set_abi(its, rev); -} - -static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - switch (addr & 0xffff) { - case GITS_PIDR0: - return 0x92; /* part number, bits[7:0] */ - case GITS_PIDR1: - return 0xb4; /* part number, bits[11:8] */ - case GITS_PIDR2: - return GIC_PIDR2_ARCH_GICv3 | 0x0b; - case GITS_PIDR4: - return 0x40; /* This is a 64K software visible page */ - /* The following are the ID registers for (any) GIC. */ - case GITS_CIDR0: - return 0x0d; - case GITS_CIDR1: - return 0xf0; - case GITS_CIDR2: - return 0x05; - case GITS_CIDR3: - return 0xb1; - } - - return 0; -} - -static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist, - phys_addr_t db, - u32 devid, u32 eventid) -{ - struct vgic_translation_cache_entry *cte; - - list_for_each_entry(cte, &dist->lpi_translation_cache, entry) { - /* - * If we hit a NULL entry, there is nothing after this - * point. - */ - if (!cte->irq) - break; - - if (cte->db != db || cte->devid != devid || - cte->eventid != eventid) - continue; - - /* - * Move this entry to the head, as it is the most - * recently used. 
- */ - if (!list_is_first(&cte->entry, &dist->lpi_translation_cache)) - list_move(&cte->entry, &dist->lpi_translation_cache); - - return cte->irq; - } - - return NULL; -} - -static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db, - u32 devid, u32 eventid) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_irq *irq; - unsigned long flags; - - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - irq = __vgic_its_check_cache(dist, db, devid, eventid); - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); - - return irq; -} - -static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its, - u32 devid, u32 eventid, - struct vgic_irq *irq) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_translation_cache_entry *cte; - unsigned long flags; - phys_addr_t db; - - /* Do not cache a directly injected interrupt */ - if (irq->hw) - return; - - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - - if (unlikely(list_empty(&dist->lpi_translation_cache))) - goto out; - - /* - * We could have raced with another CPU caching the same - * translation behind our back, so let's check it is not in - * already - */ - db = its->vgic_its_base + GITS_TRANSLATER; - if (__vgic_its_check_cache(dist, db, devid, eventid)) - goto out; - - /* Always reuse the last entry (LRU policy) */ - cte = list_last_entry(&dist->lpi_translation_cache, - typeof(*cte), entry); - - /* - * Caching the translation implies having an extra reference - * to the interrupt, so drop the potential reference on what - * was in the cache, and increment it on the new interrupt. 
- */ - if (cte->irq) - __vgic_put_lpi_locked(kvm, cte->irq); - - vgic_get_irq_kref(irq); - - cte->db = db; - cte->devid = devid; - cte->eventid = eventid; - cte->irq = irq; - - /* Move the new translation to the head of the list */ - list_move(&cte->entry, &dist->lpi_translation_cache); - -out: - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); -} - -void vgic_its_invalidate_cache(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_translation_cache_entry *cte; - unsigned long flags; - - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - - list_for_each_entry(cte, &dist->lpi_translation_cache, entry) { - /* - * If we hit a NULL entry, there is nothing after this - * point. - */ - if (!cte->irq) - break; - - __vgic_put_lpi_locked(kvm, cte->irq); - cte->irq = NULL; - } - - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); -} - -int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, - u32 devid, u32 eventid, struct vgic_irq **irq) -{ - struct kvm_vcpu *vcpu; - struct its_ite *ite; - - if (!its->enabled) - return -EBUSY; - - ite = find_ite(its, devid, eventid); - if (!ite || !its_is_collection_mapped(ite->collection)) - return E_ITS_INT_UNMAPPED_INTERRUPT; - - vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); - if (!vcpu) - return E_ITS_INT_UNMAPPED_INTERRUPT; - - if (!vcpu->arch.vgic_cpu.lpis_enabled) - return -EBUSY; - - vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq); - - *irq = ite->irq; - return 0; -} - -struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi) -{ - u64 address; - struct kvm_io_device *kvm_io_dev; - struct vgic_io_device *iodev; - - if (!vgic_has_its(kvm)) - return ERR_PTR(-ENODEV); - - if (!(msi->flags & KVM_MSI_VALID_DEVID)) - return ERR_PTR(-EINVAL); - - address = (u64)msi->address_hi << 32 | msi->address_lo; - - kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address); - if (!kvm_io_dev) - return ERR_PTR(-EINVAL); - - if (kvm_io_dev->ops != 
&kvm_io_gic_ops) - return ERR_PTR(-EINVAL); - - iodev = container_of(kvm_io_dev, struct vgic_io_device, dev); - if (iodev->iodev_type != IODEV_ITS) - return ERR_PTR(-EINVAL); - - return iodev->its; -} - -/* - * Find the target VCPU and the LPI number for a given devid/eventid pair - * and make this IRQ pending, possibly injecting it. - * Must be called with the its_lock mutex held. - * Returns 0 on success, a positive error value for any ITS mapping - * related errors and negative error values for generic errors. - */ -static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its, - u32 devid, u32 eventid) -{ - struct vgic_irq *irq = NULL; - unsigned long flags; - int err; - - err = vgic_its_resolve_lpi(kvm, its, devid, eventid, &irq); - if (err) - return err; - - if (irq->hw) - return irq_set_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, true); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - irq->pending_latch = true; - vgic_queue_irq_unlock(kvm, irq, flags); - - return 0; -} - -int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi) -{ - struct vgic_irq *irq; - unsigned long flags; - phys_addr_t db; - - db = (u64)msi->address_hi << 32 | msi->address_lo; - irq = vgic_its_check_cache(kvm, db, msi->devid, msi->data); - - if (!irq) - return -1; - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - irq->pending_latch = true; - vgic_queue_irq_unlock(kvm, irq, flags); - - return 0; -} - -/* - * Queries the KVM IO bus framework to get the ITS pointer from the given - * doorbell address. - * We then call vgic_its_trigger_msi() with the decoded data. - * According to the KVM_SIGNAL_MSI API description returns 1 on success. 
- */ -int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi) -{ - struct vgic_its *its; - int ret; - - if (!vgic_its_inject_cached_translation(kvm, msi)) - return 1; - - its = vgic_msi_to_its(kvm, msi); - if (IS_ERR(its)) - return PTR_ERR(its); - - mutex_lock(&its->its_lock); - ret = vgic_its_trigger_msi(kvm, its, msi->devid, msi->data); - mutex_unlock(&its->its_lock); - - if (ret < 0) - return ret; - - /* - * KVM_SIGNAL_MSI demands a return value > 0 for success and 0 - * if the guest has blocked the MSI. So we map any LPI mapping - * related error to that. - */ - if (ret) - return 0; - else - return 1; -} - -/* Requires the its_lock to be held. */ -static void its_free_ite(struct kvm *kvm, struct its_ite *ite) -{ - list_del(&ite->ite_list); - - /* This put matches the get in vgic_add_lpi. */ - if (ite->irq) { - if (ite->irq->hw) - WARN_ON(its_unmap_vlpi(ite->irq->host_irq)); - - vgic_put_irq(kvm, ite->irq); - } - - kfree(ite); -} - -static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size) -{ - return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1); -} - -#define its_cmd_get_command(cmd) its_cmd_mask_field(cmd, 0, 0, 8) -#define its_cmd_get_deviceid(cmd) its_cmd_mask_field(cmd, 0, 32, 32) -#define its_cmd_get_size(cmd) (its_cmd_mask_field(cmd, 1, 0, 5) + 1) -#define its_cmd_get_id(cmd) its_cmd_mask_field(cmd, 1, 0, 32) -#define its_cmd_get_physical_id(cmd) its_cmd_mask_field(cmd, 1, 32, 32) -#define its_cmd_get_collection(cmd) its_cmd_mask_field(cmd, 2, 0, 16) -#define its_cmd_get_ittaddr(cmd) (its_cmd_mask_field(cmd, 2, 8, 44) << 8) -#define its_cmd_get_target_addr(cmd) its_cmd_mask_field(cmd, 2, 16, 32) -#define its_cmd_get_validbit(cmd) its_cmd_mask_field(cmd, 2, 63, 1) - -/* - * The DISCARD command frees an Interrupt Translation Table Entry (ITTE). - * Must be called with the its_lock mutex held. 
- */ -static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - u32 event_id = its_cmd_get_id(its_cmd); - struct its_ite *ite; - - ite = find_ite(its, device_id, event_id); - if (ite && its_is_collection_mapped(ite->collection)) { - /* - * Though the spec talks about removing the pending state, we - * don't bother here since we clear the ITTE anyway and the - * pending state is a property of the ITTE struct. - */ - vgic_its_invalidate_cache(kvm); - - its_free_ite(kvm, ite); - return 0; - } - - return E_ITS_DISCARD_UNMAPPED_INTERRUPT; -} - -/* - * The MOVI command moves an ITTE to a different collection. - * Must be called with the its_lock mutex held. - */ -static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - u32 event_id = its_cmd_get_id(its_cmd); - u32 coll_id = its_cmd_get_collection(its_cmd); - struct kvm_vcpu *vcpu; - struct its_ite *ite; - struct its_collection *collection; - - ite = find_ite(its, device_id, event_id); - if (!ite) - return E_ITS_MOVI_UNMAPPED_INTERRUPT; - - if (!its_is_collection_mapped(ite->collection)) - return E_ITS_MOVI_UNMAPPED_COLLECTION; - - collection = find_collection(its, coll_id); - if (!its_is_collection_mapped(collection)) - return E_ITS_MOVI_UNMAPPED_COLLECTION; - - ite->collection = collection; - vcpu = kvm_get_vcpu(kvm, collection->target_addr); - - vgic_its_invalidate_cache(kvm); - - return update_affinity(ite->irq, vcpu); -} - -/* - * Check whether an ID can be stored into the corresponding guest table. - * For a direct table this is pretty easy, but gets a bit nasty for - * indirect tables. We check whether the resulting guest physical address - * is actually valid (covered by a memslot and guest accessible). - * For this we have to read the respective first level entry. 
- */ -static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, - gpa_t *eaddr) -{ - int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; - u64 indirect_ptr, type = GITS_BASER_TYPE(baser); - phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser); - int esz = GITS_BASER_ENTRY_SIZE(baser); - int index, idx; - gfn_t gfn; - bool ret; - - switch (type) { - case GITS_BASER_TYPE_DEVICE: - if (id >= BIT_ULL(VITS_TYPER_DEVBITS)) - return false; - break; - case GITS_BASER_TYPE_COLLECTION: - /* as GITS_TYPER.CIL == 0, ITS supports 16-bit collection ID */ - if (id >= BIT_ULL(16)) - return false; - break; - default: - return false; - } - - if (!(baser & GITS_BASER_INDIRECT)) { - phys_addr_t addr; - - if (id >= (l1_tbl_size / esz)) - return false; - - addr = base + id * esz; - gfn = addr >> PAGE_SHIFT; - - if (eaddr) - *eaddr = addr; - - goto out; - } - - /* calculate and check the index into the 1st level */ - index = id / (SZ_64K / esz); - if (index >= (l1_tbl_size / sizeof(u64))) - return false; - - /* Each 1st level entry is represented by a 64-bit value. */ - if (kvm_read_guest_lock(its->dev->kvm, - base + index * sizeof(indirect_ptr), - &indirect_ptr, sizeof(indirect_ptr))) - return false; - - indirect_ptr = le64_to_cpu(indirect_ptr); - - /* check the valid bit of the first level entry */ - if (!(indirect_ptr & BIT_ULL(63))) - return false; - - /* Mask the guest physical address and calculate the frame number. 
*/ - indirect_ptr &= GENMASK_ULL(51, 16); - - /* Find the address of the actual entry */ - index = id % (SZ_64K / esz); - indirect_ptr += index * esz; - gfn = indirect_ptr >> PAGE_SHIFT; - - if (eaddr) - *eaddr = indirect_ptr; - -out: - idx = srcu_read_lock(&its->dev->kvm->srcu); - ret = kvm_is_visible_gfn(its->dev->kvm, gfn); - srcu_read_unlock(&its->dev->kvm->srcu, idx); - return ret; -} - -static int vgic_its_alloc_collection(struct vgic_its *its, - struct its_collection **colp, - u32 coll_id) -{ - struct its_collection *collection; - - if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL)) - return E_ITS_MAPC_COLLECTION_OOR; - - collection = kzalloc(sizeof(*collection), GFP_KERNEL); - if (!collection) - return -ENOMEM; - - collection->collection_id = coll_id; - collection->target_addr = COLLECTION_NOT_MAPPED; - - list_add_tail(&collection->coll_list, &its->collection_list); - *colp = collection; - - return 0; -} - -static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id) -{ - struct its_collection *collection; - struct its_device *device; - struct its_ite *ite; - - /* - * Clearing the mapping for that collection ID removes the - * entry from the list. If there wasn't any before, we can - * go home early. 
- */ - collection = find_collection(its, coll_id); - if (!collection) - return; - - for_each_lpi_its(device, ite, its) - if (ite->collection && - ite->collection->collection_id == coll_id) - ite->collection = NULL; - - list_del(&collection->coll_list); - kfree(collection); -} - -/* Must be called with its_lock mutex held */ -static struct its_ite *vgic_its_alloc_ite(struct its_device *device, - struct its_collection *collection, - u32 event_id) -{ - struct its_ite *ite; - - ite = kzalloc(sizeof(*ite), GFP_KERNEL); - if (!ite) - return ERR_PTR(-ENOMEM); - - ite->event_id = event_id; - ite->collection = collection; - - list_add_tail(&ite->ite_list, &device->itt_head); - return ite; -} - -/* - * The MAPTI and MAPI commands map LPIs to ITTEs. - * Must be called with its_lock mutex held. - */ -static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - u32 event_id = its_cmd_get_id(its_cmd); - u32 coll_id = its_cmd_get_collection(its_cmd); - struct its_ite *ite; - struct kvm_vcpu *vcpu = NULL; - struct its_device *device; - struct its_collection *collection, *new_coll = NULL; - struct vgic_irq *irq; - int lpi_nr; - - device = find_its_device(its, device_id); - if (!device) - return E_ITS_MAPTI_UNMAPPED_DEVICE; - - if (event_id >= BIT_ULL(device->num_eventid_bits)) - return E_ITS_MAPTI_ID_OOR; - - if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI) - lpi_nr = its_cmd_get_physical_id(its_cmd); - else - lpi_nr = event_id; - if (lpi_nr < GIC_LPI_OFFSET || - lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser)) - return E_ITS_MAPTI_PHYSICALID_OOR; - - /* If there is an existing mapping, behavior is UNPREDICTABLE. 
*/ - if (find_ite(its, device_id, event_id)) - return 0; - - collection = find_collection(its, coll_id); - if (!collection) { - int ret = vgic_its_alloc_collection(its, &collection, coll_id); - if (ret) - return ret; - new_coll = collection; - } - - ite = vgic_its_alloc_ite(device, collection, event_id); - if (IS_ERR(ite)) { - if (new_coll) - vgic_its_free_collection(its, coll_id); - return PTR_ERR(ite); - } - - if (its_is_collection_mapped(collection)) - vcpu = kvm_get_vcpu(kvm, collection->target_addr); - - irq = vgic_add_lpi(kvm, lpi_nr, vcpu); - if (IS_ERR(irq)) { - if (new_coll) - vgic_its_free_collection(its, coll_id); - its_free_ite(kvm, ite); - return PTR_ERR(irq); - } - ite->irq = irq; - - return 0; -} - -/* Requires the its_lock to be held. */ -static void vgic_its_free_device(struct kvm *kvm, struct its_device *device) -{ - struct its_ite *ite, *temp; - - /* - * The spec says that unmapping a device with still valid - * ITTEs associated is UNPREDICTABLE. We remove all ITTEs, - * since we cannot leave the memory unreferenced. 
- */ - list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list) - its_free_ite(kvm, ite); - - vgic_its_invalidate_cache(kvm); - - list_del(&device->dev_list); - kfree(device); -} - -/* its lock must be held */ -static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its) -{ - struct its_device *cur, *temp; - - list_for_each_entry_safe(cur, temp, &its->device_list, dev_list) - vgic_its_free_device(kvm, cur); -} - -/* its lock must be held */ -static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its) -{ - struct its_collection *cur, *temp; - - list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list) - vgic_its_free_collection(its, cur->collection_id); -} - -/* Must be called with its_lock mutex held */ -static struct its_device *vgic_its_alloc_device(struct vgic_its *its, - u32 device_id, gpa_t itt_addr, - u8 num_eventid_bits) -{ - struct its_device *device; - - device = kzalloc(sizeof(*device), GFP_KERNEL); - if (!device) - return ERR_PTR(-ENOMEM); - - device->device_id = device_id; - device->itt_addr = itt_addr; - device->num_eventid_bits = num_eventid_bits; - INIT_LIST_HEAD(&device->itt_head); - - list_add_tail(&device->dev_list, &its->device_list); - return device; -} - -/* - * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs). - * Must be called with the its_lock mutex held. 
- */ -static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - bool valid = its_cmd_get_validbit(its_cmd); - u8 num_eventid_bits = its_cmd_get_size(its_cmd); - gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd); - struct its_device *device; - - if (!vgic_its_check_id(its, its->baser_device_table, device_id, NULL)) - return E_ITS_MAPD_DEVICE_OOR; - - if (valid && num_eventid_bits > VITS_TYPER_IDBITS) - return E_ITS_MAPD_ITTSIZE_OOR; - - device = find_its_device(its, device_id); - - /* - * The spec says that calling MAPD on an already mapped device - * invalidates all cached data for this device. We implement this - * by removing the mapping and re-establishing it. - */ - if (device) - vgic_its_free_device(kvm, device); - - /* - * The spec does not say whether unmapping a not-mapped device - * is an error, so we are done in any case. - */ - if (!valid) - return 0; - - device = vgic_its_alloc_device(its, device_id, itt_addr, - num_eventid_bits); - - return PTR_ERR_OR_ZERO(device); -} - -/* - * The MAPC command maps collection IDs to redistributors. - * Must be called with the its_lock mutex held. 
- */ -static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u16 coll_id; - u32 target_addr; - struct its_collection *collection; - bool valid; - - valid = its_cmd_get_validbit(its_cmd); - coll_id = its_cmd_get_collection(its_cmd); - target_addr = its_cmd_get_target_addr(its_cmd); - - if (target_addr >= atomic_read(&kvm->online_vcpus)) - return E_ITS_MAPC_PROCNUM_OOR; - - if (!valid) { - vgic_its_free_collection(its, coll_id); - vgic_its_invalidate_cache(kvm); - } else { - collection = find_collection(its, coll_id); - - if (!collection) { - int ret; - - ret = vgic_its_alloc_collection(its, &collection, - coll_id); - if (ret) - return ret; - collection->target_addr = target_addr; - } else { - collection->target_addr = target_addr; - update_affinity_collection(kvm, its, collection); - } - } - - return 0; -} - -/* - * The CLEAR command removes the pending state for a particular LPI. - * Must be called with the its_lock mutex held. - */ -static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - u32 event_id = its_cmd_get_id(its_cmd); - struct its_ite *ite; - - - ite = find_ite(its, device_id, event_id); - if (!ite) - return E_ITS_CLEAR_UNMAPPED_INTERRUPT; - - ite->irq->pending_latch = false; - - if (ite->irq->hw) - return irq_set_irqchip_state(ite->irq->host_irq, - IRQCHIP_STATE_PENDING, false); - - return 0; -} - -/* - * The INV command syncs the configuration bits from the memory table. - * Must be called with the its_lock mutex held. 
- */ -static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - u32 event_id = its_cmd_get_id(its_cmd); - struct its_ite *ite; - - - ite = find_ite(its, device_id, event_id); - if (!ite) - return E_ITS_INV_UNMAPPED_INTERRUPT; - - return update_lpi_config(kvm, ite->irq, NULL, true); -} - -/* - * The INVALL command requests flushing of all IRQ data in this collection. - * Find the VCPU mapped to that collection, then iterate over the VM's list - * of mapped LPIs and update the configuration for each IRQ which targets - * the specified vcpu. The configuration will be read from the in-memory - * configuration table. - * Must be called with the its_lock mutex held. - */ -static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 coll_id = its_cmd_get_collection(its_cmd); - struct its_collection *collection; - struct kvm_vcpu *vcpu; - struct vgic_irq *irq; - u32 *intids; - int irq_count, i; - - collection = find_collection(its, coll_id); - if (!its_is_collection_mapped(collection)) - return E_ITS_INVALL_UNMAPPED_COLLECTION; - - vcpu = kvm_get_vcpu(kvm, collection->target_addr); - - irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids); - if (irq_count < 0) - return irq_count; - - for (i = 0; i < irq_count; i++) { - irq = vgic_get_irq(kvm, NULL, intids[i]); - if (!irq) - continue; - update_lpi_config(kvm, irq, vcpu, false); - vgic_put_irq(kvm, irq); - } - - kfree(intids); - - if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm) - its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe); - - return 0; -} - -/* - * The MOVALL command moves the pending state of all IRQs targeting one - * redistributor to another. We don't hold the pending state in the VCPUs, - * but in the IRQs instead, so there is really not much to do for us here. 
- * However the spec says that no IRQ must target the old redistributor - * afterwards, so we make sure that no LPI is using the associated target_vcpu. - * This command affects all LPIs in the system that target that redistributor. - */ -static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 target1_addr = its_cmd_get_target_addr(its_cmd); - u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32); - struct kvm_vcpu *vcpu1, *vcpu2; - struct vgic_irq *irq; - u32 *intids; - int irq_count, i; - - if (target1_addr >= atomic_read(&kvm->online_vcpus) || - target2_addr >= atomic_read(&kvm->online_vcpus)) - return E_ITS_MOVALL_PROCNUM_OOR; - - if (target1_addr == target2_addr) - return 0; - - vcpu1 = kvm_get_vcpu(kvm, target1_addr); - vcpu2 = kvm_get_vcpu(kvm, target2_addr); - - irq_count = vgic_copy_lpi_list(kvm, vcpu1, &intids); - if (irq_count < 0) - return irq_count; - - for (i = 0; i < irq_count; i++) { - irq = vgic_get_irq(kvm, NULL, intids[i]); - - update_affinity(irq, vcpu2); - - vgic_put_irq(kvm, irq); - } - - vgic_its_invalidate_cache(kvm); - - kfree(intids); - return 0; -} - -/* - * The INT command injects the LPI associated with that DevID/EvID pair. - * Must be called with the its_lock mutex held. - */ -static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 msi_data = its_cmd_get_id(its_cmd); - u64 msi_devid = its_cmd_get_deviceid(its_cmd); - - return vgic_its_trigger_msi(kvm, its, msi_devid, msi_data); -} - -/* - * This function is called with the its_cmd lock held, but the ITS data - * structure lock dropped. 
- */ -static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - int ret = -ENODEV; - - mutex_lock(&its->its_lock); - switch (its_cmd_get_command(its_cmd)) { - case GITS_CMD_MAPD: - ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd); - break; - case GITS_CMD_MAPC: - ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd); - break; - case GITS_CMD_MAPI: - ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); - break; - case GITS_CMD_MAPTI: - ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); - break; - case GITS_CMD_MOVI: - ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd); - break; - case GITS_CMD_DISCARD: - ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd); - break; - case GITS_CMD_CLEAR: - ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd); - break; - case GITS_CMD_MOVALL: - ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd); - break; - case GITS_CMD_INT: - ret = vgic_its_cmd_handle_int(kvm, its, its_cmd); - break; - case GITS_CMD_INV: - ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd); - break; - case GITS_CMD_INVALL: - ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd); - break; - case GITS_CMD_SYNC: - /* we ignore this command: we are in sync all of the time */ - ret = 0; - break; - } - mutex_unlock(&its->its_lock); - - return ret; -} - -static u64 vgic_sanitise_its_baser(u64 reg) -{ - reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK, - GITS_BASER_SHAREABILITY_SHIFT, - vgic_sanitise_shareability); - reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK, - GITS_BASER_INNER_CACHEABILITY_SHIFT, - vgic_sanitise_inner_cacheability); - reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK, - GITS_BASER_OUTER_CACHEABILITY_SHIFT, - vgic_sanitise_outer_cacheability); - - /* We support only one (ITS) page size: 64K */ - reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K; - - return reg; -} - -static u64 vgic_sanitise_its_cbaser(u64 reg) -{ - reg = vgic_sanitise_field(reg, 
GITS_CBASER_SHAREABILITY_MASK, - GITS_CBASER_SHAREABILITY_SHIFT, - vgic_sanitise_shareability); - reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK, - GITS_CBASER_INNER_CACHEABILITY_SHIFT, - vgic_sanitise_inner_cacheability); - reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK, - GITS_CBASER_OUTER_CACHEABILITY_SHIFT, - vgic_sanitise_outer_cacheability); - - /* Sanitise the physical address to be 64k aligned. */ - reg &= ~GENMASK_ULL(15, 12); - - return reg; -} - -static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - return extract_bytes(its->cbaser, addr & 7, len); -} - -static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) -{ - /* When GITS_CTLR.Enable is 1, this register is RO. */ - if (its->enabled) - return; - - mutex_lock(&its->cmd_lock); - its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val); - its->cbaser = vgic_sanitise_its_cbaser(its->cbaser); - its->creadr = 0; - /* - * CWRITER is architecturally UNKNOWN on reset, but we need to reset - * it to CREADR to make sure we start with an empty command buffer. - */ - its->cwriter = its->creadr; - mutex_unlock(&its->cmd_lock); -} - -#define ITS_CMD_BUFFER_SIZE(baser) ((((baser) & 0xff) + 1) << 12) -#define ITS_CMD_SIZE 32 -#define ITS_CMD_OFFSET(reg) ((reg) & GENMASK(19, 5)) - -/* Must be called with the cmd_lock held. */ -static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its) -{ - gpa_t cbaser; - u64 cmd_buf[4]; - - /* Commands are only processed when the ITS is enabled. 
*/ - if (!its->enabled) - return; - - cbaser = GITS_CBASER_ADDRESS(its->cbaser); - - while (its->cwriter != its->creadr) { - int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr, - cmd_buf, ITS_CMD_SIZE); - /* - * If kvm_read_guest() fails, this could be due to the guest - * programming a bogus value in CBASER or something else going - * wrong from which we cannot easily recover. - * According to section 6.3.2 in the GICv3 spec we can just - * ignore that command then. - */ - if (!ret) - vgic_its_handle_command(kvm, its, cmd_buf); - - its->creadr += ITS_CMD_SIZE; - if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser)) - its->creadr = 0; - } -} - -/* - * By writing to CWRITER the guest announces new commands to be processed. - * To avoid any races in the first place, we take the its_cmd lock, which - * protects our ring buffer variables, so that there is only one user - * per ITS handling commands at a given time. - */ -static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u64 reg; - - if (!its) - return; - - mutex_lock(&its->cmd_lock); - - reg = update_64bit_reg(its->cwriter, addr & 7, len, val); - reg = ITS_CMD_OFFSET(reg); - if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) { - mutex_unlock(&its->cmd_lock); - return; - } - its->cwriter = reg; - - vgic_its_process_commands(kvm, its); - - mutex_unlock(&its->cmd_lock); -} - -static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - return extract_bytes(its->cwriter, addr & 0x7, len); -} - -static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - return extract_bytes(its->creadr, addr & 0x7, len); -} - -static int vgic_mmio_uaccess_write_its_creadr(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 cmd_offset; - int ret = 0; - - 
mutex_lock(&its->cmd_lock); - - if (its->enabled) { - ret = -EBUSY; - goto out; - } - - cmd_offset = ITS_CMD_OFFSET(val); - if (cmd_offset >= ITS_CMD_BUFFER_SIZE(its->cbaser)) { - ret = -EINVAL; - goto out; - } - - its->creadr = cmd_offset; -out: - mutex_unlock(&its->cmd_lock); - return ret; -} - -#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7) -static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - u64 reg; - - switch (BASER_INDEX(addr)) { - case 0: - reg = its->baser_device_table; - break; - case 1: - reg = its->baser_coll_table; - break; - default: - reg = 0; - break; - } - - return extract_bytes(reg, addr & 7, len); -} - -#define GITS_BASER_RO_MASK (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56)) -static void vgic_mmio_write_its_baser(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - u64 entry_size, table_type; - u64 reg, *regptr, clearbits = 0; - - /* When GITS_CTLR.Enable is 1, we ignore write accesses. 
*/ - if (its->enabled) - return; - - switch (BASER_INDEX(addr)) { - case 0: - regptr = &its->baser_device_table; - entry_size = abi->dte_esz; - table_type = GITS_BASER_TYPE_DEVICE; - break; - case 1: - regptr = &its->baser_coll_table; - entry_size = abi->cte_esz; - table_type = GITS_BASER_TYPE_COLLECTION; - clearbits = GITS_BASER_INDIRECT; - break; - default: - return; - } - - reg = update_64bit_reg(*regptr, addr & 7, len, val); - reg &= ~GITS_BASER_RO_MASK; - reg &= ~clearbits; - - reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT; - reg |= table_type << GITS_BASER_TYPE_SHIFT; - reg = vgic_sanitise_its_baser(reg); - - *regptr = reg; - - if (!(reg & GITS_BASER_VALID)) { - /* Take the its_lock to prevent a race with a save/restore */ - mutex_lock(&its->its_lock); - switch (table_type) { - case GITS_BASER_TYPE_DEVICE: - vgic_its_free_device_list(kvm, its); - break; - case GITS_BASER_TYPE_COLLECTION: - vgic_its_free_collection_list(kvm, its); - break; - } - mutex_unlock(&its->its_lock); - } -} - -static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - u32 reg = 0; - - mutex_lock(&its->cmd_lock); - if (its->creadr == its->cwriter) - reg |= GITS_CTLR_QUIESCENT; - if (its->enabled) - reg |= GITS_CTLR_ENABLE; - mutex_unlock(&its->cmd_lock); - - return reg; -} - -static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) -{ - mutex_lock(&its->cmd_lock); - - /* - * It is UNPREDICTABLE to enable the ITS if any of the CBASER or - * device/collection BASER are invalid - */ - if (!its->enabled && (val & GITS_CTLR_ENABLE) && - (!(its->baser_device_table & GITS_BASER_VALID) || - !(its->baser_coll_table & GITS_BASER_VALID) || - !(its->cbaser & GITS_CBASER_VALID))) - goto out; - - its->enabled = !!(val & GITS_CTLR_ENABLE); - if (!its->enabled) - vgic_its_invalidate_cache(kvm); - - /* - * Try to process any pending commands. 
This function bails out early - * if the ITS is disabled or no commands have been queued. - */ - vgic_its_process_commands(kvm, its); - -out: - mutex_unlock(&its->cmd_lock); -} - -#define REGISTER_ITS_DESC(off, rd, wr, length, acc) \ -{ \ - .reg_offset = off, \ - .len = length, \ - .access_flags = acc, \ - .its_read = rd, \ - .its_write = wr, \ -} - -#define REGISTER_ITS_DESC_UACCESS(off, rd, wr, uwr, length, acc)\ -{ \ - .reg_offset = off, \ - .len = length, \ - .access_flags = acc, \ - .its_read = rd, \ - .its_write = wr, \ - .uaccess_its_write = uwr, \ -} - -static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, unsigned long val) -{ - /* Ignore */ -} - -static struct vgic_register_region its_registers[] = { - REGISTER_ITS_DESC(GITS_CTLR, - vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4, - VGIC_ACCESS_32bit), - REGISTER_ITS_DESC_UACCESS(GITS_IIDR, - vgic_mmio_read_its_iidr, its_mmio_write_wi, - vgic_mmio_uaccess_write_its_iidr, 4, - VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_TYPER, - vgic_mmio_read_its_typer, its_mmio_write_wi, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_CBASER, - vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_CWRITER, - vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_ITS_DESC_UACCESS(GITS_CREADR, - vgic_mmio_read_its_creadr, its_mmio_write_wi, - vgic_mmio_uaccess_write_its_creadr, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_BASER, - vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_IDREGS_BASE, - vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30, - VGIC_ACCESS_32bit), -}; - -/* This is called on setting the LPI enable bit in the redistributor. 
*/ -void vgic_enable_lpis(struct kvm_vcpu *vcpu) -{ - if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ)) - its_sync_lpi_pending_table(vcpu); -} - -static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its, - u64 addr) -{ - struct vgic_io_device *iodev = &its->iodev; - int ret; - - mutex_lock(&kvm->slots_lock); - if (!IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) { - ret = -EBUSY; - goto out; - } - - its->vgic_its_base = addr; - iodev->regions = its_registers; - iodev->nr_regions = ARRAY_SIZE(its_registers); - kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops); - - iodev->base_addr = its->vgic_its_base; - iodev->iodev_type = IODEV_ITS; - iodev->its = its; - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr, - KVM_VGIC_V3_ITS_SIZE, &iodev->dev); -out: - mutex_unlock(&kvm->slots_lock); - - return ret; -} - -/* Default is 16 cached LPIs per vcpu */ -#define LPI_DEFAULT_PCPU_CACHE_SIZE 16 - -void vgic_lpi_translation_cache_init(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - unsigned int sz; - int i; - - if (!list_empty(&dist->lpi_translation_cache)) - return; - - sz = atomic_read(&kvm->online_vcpus) * LPI_DEFAULT_PCPU_CACHE_SIZE; - - for (i = 0; i < sz; i++) { - struct vgic_translation_cache_entry *cte; - - /* An allocation failure is not fatal */ - cte = kzalloc(sizeof(*cte), GFP_KERNEL); - if (WARN_ON(!cte)) - break; - - INIT_LIST_HEAD(&cte->entry); - list_add(&cte->entry, &dist->lpi_translation_cache); - } -} - -void vgic_lpi_translation_cache_destroy(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_translation_cache_entry *cte, *tmp; - - vgic_its_invalidate_cache(kvm); - - list_for_each_entry_safe(cte, tmp, - &dist->lpi_translation_cache, entry) { - list_del(&cte->entry); - kfree(cte); - } -} - -#define INITIAL_BASER_VALUE \ - (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \ - GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \ - GIC_BASER_SHAREABILITY(GITS_BASER, 
InnerShareable) | \ - GITS_BASER_PAGE_SIZE_64K) - -#define INITIAL_PROPBASER_VALUE \ - (GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) | \ - GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner) | \ - GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable)) - -static int vgic_its_create(struct kvm_device *dev, u32 type) -{ - struct vgic_its *its; - - if (type != KVM_DEV_TYPE_ARM_VGIC_ITS) - return -ENODEV; - - its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL); - if (!its) - return -ENOMEM; - - if (vgic_initialized(dev->kvm)) { - int ret = vgic_v4_init(dev->kvm); - if (ret < 0) { - kfree(its); - return ret; - } - - vgic_lpi_translation_cache_init(dev->kvm); - } - - mutex_init(&its->its_lock); - mutex_init(&its->cmd_lock); - - its->vgic_its_base = VGIC_ADDR_UNDEF; - - INIT_LIST_HEAD(&its->device_list); - INIT_LIST_HEAD(&its->collection_list); - - dev->kvm->arch.vgic.msis_require_devid = true; - dev->kvm->arch.vgic.has_its = true; - its->enabled = false; - its->dev = dev; - - its->baser_device_table = INITIAL_BASER_VALUE | - ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT); - its->baser_coll_table = INITIAL_BASER_VALUE | - ((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT); - dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE; - - dev->private = its; - - return vgic_its_set_abi(its, NR_ITS_ABIS - 1); -} - -static void vgic_its_destroy(struct kvm_device *kvm_dev) -{ - struct kvm *kvm = kvm_dev->kvm; - struct vgic_its *its = kvm_dev->private; - - mutex_lock(&its->its_lock); - - vgic_its_free_device_list(kvm, its); - vgic_its_free_collection_list(kvm, its); - - mutex_unlock(&its->its_lock); - kfree(its); - kfree(kvm_dev);/* alloc by kvm_ioctl_create_device, free by .destroy */ -} - -static int vgic_its_has_attr_regs(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - const struct vgic_register_region *region; - gpa_t offset = attr->attr; - int align; - - align = (offset < GITS_TYPER) || (offset >= GITS_PIDR4) ? 
0x3 : 0x7; - - if (offset & align) - return -EINVAL; - - region = vgic_find_mmio_region(its_registers, - ARRAY_SIZE(its_registers), - offset); - if (!region) - return -ENXIO; - - return 0; -} - -static int vgic_its_attr_regs_access(struct kvm_device *dev, - struct kvm_device_attr *attr, - u64 *reg, bool is_write) -{ - const struct vgic_register_region *region; - struct vgic_its *its; - gpa_t addr, offset; - unsigned int len; - int align, ret = 0; - - its = dev->private; - offset = attr->attr; - - /* - * Although the spec supports upper/lower 32-bit accesses to - * 64-bit ITS registers, the userspace ABI requires 64-bit - * accesses to all 64-bit wide registers. We therefore only - * support 32-bit accesses to GITS_CTLR, GITS_IIDR and GITS ID - * registers - */ - if ((offset < GITS_TYPER) || (offset >= GITS_PIDR4)) - align = 0x3; - else - align = 0x7; - - if (offset & align) - return -EINVAL; - - mutex_lock(&dev->kvm->lock); - - if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) { - ret = -ENXIO; - goto out; - } - - region = vgic_find_mmio_region(its_registers, - ARRAY_SIZE(its_registers), - offset); - if (!region) { - ret = -ENXIO; - goto out; - } - - if (!lock_all_vcpus(dev->kvm)) { - ret = -EBUSY; - goto out; - } - - addr = its->vgic_its_base + offset; - - len = region->access_flags & VGIC_ACCESS_64bit ? 
8 : 4; - - if (is_write) { - if (region->uaccess_its_write) - ret = region->uaccess_its_write(dev->kvm, its, addr, - len, *reg); - else - region->its_write(dev->kvm, its, addr, len, *reg); - } else { - *reg = region->its_read(dev->kvm, its, addr, len); - } - unlock_all_vcpus(dev->kvm); -out: - mutex_unlock(&dev->kvm->lock); - return ret; -} - -static u32 compute_next_devid_offset(struct list_head *h, - struct its_device *dev) -{ - struct its_device *next; - u32 next_offset; - - if (list_is_last(&dev->dev_list, h)) - return 0; - next = list_next_entry(dev, dev_list); - next_offset = next->device_id - dev->device_id; - - return min_t(u32, next_offset, VITS_DTE_MAX_DEVID_OFFSET); -} - -static u32 compute_next_eventid_offset(struct list_head *h, struct its_ite *ite) -{ - struct its_ite *next; - u32 next_offset; - - if (list_is_last(&ite->ite_list, h)) - return 0; - next = list_next_entry(ite, ite_list); - next_offset = next->event_id - ite->event_id; - - return min_t(u32, next_offset, VITS_ITE_MAX_EVENTID_OFFSET); -} - -/** - * entry_fn_t - Callback called on a table entry restore path - * @its: its handle - * @id: id of the entry - * @entry: pointer to the entry - * @opaque: pointer to an opaque data - * - * Return: < 0 on error, 0 if last element was identified, id offset to next - * element otherwise - */ -typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry, - void *opaque); - -/** - * scan_its_table - Scan a contiguous table in guest RAM and applies a function - * to each entry - * - * @its: its handle - * @base: base gpa of the table - * @size: size of the table in bytes - * @esz: entry size in bytes - * @start_id: the ID of the first entry in the table - * (non zero for 2d level tables) - * @fn: function to apply on each entry - * - * Return: < 0 on error, 0 if last element was identified, 1 otherwise - * (the last element may not be found on second level tables) - */ -static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, 
- int start_id, entry_fn_t fn, void *opaque) -{ - struct kvm *kvm = its->dev->kvm; - unsigned long len = size; - int id = start_id; - gpa_t gpa = base; - char entry[ESZ_MAX]; - int ret; - - memset(entry, 0, esz); - - while (len > 0) { - int next_offset; - size_t byte_offset; - - ret = kvm_read_guest_lock(kvm, gpa, entry, esz); - if (ret) - return ret; - - next_offset = fn(its, id, entry, opaque); - if (next_offset <= 0) - return next_offset; - - byte_offset = next_offset * esz; - id += next_offset; - gpa += byte_offset; - len -= byte_offset; - } - return 1; -} - -/** - * vgic_its_save_ite - Save an interrupt translation entry at @gpa - */ -static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, - struct its_ite *ite, gpa_t gpa, int ite_esz) -{ - struct kvm *kvm = its->dev->kvm; - u32 next_offset; - u64 val; - - next_offset = compute_next_eventid_offset(&dev->itt_head, ite); - val = ((u64)next_offset << KVM_ITS_ITE_NEXT_SHIFT) | - ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | - ite->collection->collection_id; - val = cpu_to_le64(val); - return kvm_write_guest_lock(kvm, gpa, &val, ite_esz); -} - -/** - * vgic_its_restore_ite - restore an interrupt translation entry - * @event_id: id used for indexing - * @ptr: pointer to the ITE entry - * @opaque: pointer to the its_device - */ -static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id, - void *ptr, void *opaque) -{ - struct its_device *dev = (struct its_device *)opaque; - struct its_collection *collection; - struct kvm *kvm = its->dev->kvm; - struct kvm_vcpu *vcpu = NULL; - u64 val; - u64 *p = (u64 *)ptr; - struct vgic_irq *irq; - u32 coll_id, lpi_id; - struct its_ite *ite; - u32 offset; - - val = *p; - - val = le64_to_cpu(val); - - coll_id = val & KVM_ITS_ITE_ICID_MASK; - lpi_id = (val & KVM_ITS_ITE_PINTID_MASK) >> KVM_ITS_ITE_PINTID_SHIFT; - - if (!lpi_id) - return 1; /* invalid entry, no choice but to scan next entry */ - - if (lpi_id < VGIC_MIN_LPI) - return -EINVAL; - - offset 
= val >> KVM_ITS_ITE_NEXT_SHIFT; - if (event_id + offset >= BIT_ULL(dev->num_eventid_bits)) - return -EINVAL; - - collection = find_collection(its, coll_id); - if (!collection) - return -EINVAL; - - ite = vgic_its_alloc_ite(dev, collection, event_id); - if (IS_ERR(ite)) - return PTR_ERR(ite); - - if (its_is_collection_mapped(collection)) - vcpu = kvm_get_vcpu(kvm, collection->target_addr); - - irq = vgic_add_lpi(kvm, lpi_id, vcpu); - if (IS_ERR(irq)) - return PTR_ERR(irq); - ite->irq = irq; - - return offset; -} - -static int vgic_its_ite_cmp(void *priv, struct list_head *a, - struct list_head *b) -{ - struct its_ite *itea = container_of(a, struct its_ite, ite_list); - struct its_ite *iteb = container_of(b, struct its_ite, ite_list); - - if (itea->event_id < iteb->event_id) - return -1; - else - return 1; -} - -static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - gpa_t base = device->itt_addr; - struct its_ite *ite; - int ret; - int ite_esz = abi->ite_esz; - - list_sort(NULL, &device->itt_head, vgic_its_ite_cmp); - - list_for_each_entry(ite, &device->itt_head, ite_list) { - gpa_t gpa = base + ite->event_id * ite_esz; - - /* - * If an LPI carries the HW bit, this means that this - * interrupt is controlled by GICv4, and we do not - * have direct access to that state. Let's simply fail - * the save operation... 
- */ - if (ite->irq->hw) - return -EACCES; - - ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz); - if (ret) - return ret; - } - return 0; -} - -/** - * vgic_its_restore_itt - restore the ITT of a device - * - * @its: its handle - * @dev: device handle - * - * Return 0 on success, < 0 on error - */ -static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - gpa_t base = dev->itt_addr; - int ret; - int ite_esz = abi->ite_esz; - size_t max_size = BIT_ULL(dev->num_eventid_bits) * ite_esz; - - ret = scan_its_table(its, base, max_size, ite_esz, 0, - vgic_its_restore_ite, dev); - - /* scan_its_table returns +1 if all ITEs are invalid */ - if (ret > 0) - ret = 0; - - return ret; -} - -/** - * vgic_its_save_dte - Save a device table entry at a given GPA - * - * @its: ITS handle - * @dev: ITS device - * @ptr: GPA - */ -static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, - gpa_t ptr, int dte_esz) -{ - struct kvm *kvm = its->dev->kvm; - u64 val, itt_addr_field; - u32 next_offset; - - itt_addr_field = dev->itt_addr >> 8; - next_offset = compute_next_devid_offset(&its->device_list, dev); - val = (1ULL << KVM_ITS_DTE_VALID_SHIFT | - ((u64)next_offset << KVM_ITS_DTE_NEXT_SHIFT) | - (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | - (dev->num_eventid_bits - 1)); - val = cpu_to_le64(val); - return kvm_write_guest_lock(kvm, ptr, &val, dte_esz); -} - -/** - * vgic_its_restore_dte - restore a device table entry - * - * @its: its handle - * @id: device id the DTE corresponds to - * @ptr: kernel VA where the 8 byte DTE is located - * @opaque: unused - * - * Return: < 0 on error, 0 if the dte is the last one, id offset to the - * next dte otherwise - */ -static int vgic_its_restore_dte(struct vgic_its *its, u32 id, - void *ptr, void *opaque) -{ - struct its_device *dev; - gpa_t itt_addr; - u8 num_eventid_bits; - u64 entry = *(u64 *)ptr; - bool valid; - u32 offset; - int ret; - - 
entry = le64_to_cpu(entry); - - valid = entry >> KVM_ITS_DTE_VALID_SHIFT; - num_eventid_bits = (entry & KVM_ITS_DTE_SIZE_MASK) + 1; - itt_addr = ((entry & KVM_ITS_DTE_ITTADDR_MASK) - >> KVM_ITS_DTE_ITTADDR_SHIFT) << 8; - - if (!valid) - return 1; - - /* dte entry is valid */ - offset = (entry & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT; - - dev = vgic_its_alloc_device(its, id, itt_addr, num_eventid_bits); - if (IS_ERR(dev)) - return PTR_ERR(dev); - - ret = vgic_its_restore_itt(its, dev); - if (ret) { - vgic_its_free_device(its->dev->kvm, dev); - return ret; - } - - return offset; -} - -static int vgic_its_device_cmp(void *priv, struct list_head *a, - struct list_head *b) -{ - struct its_device *deva = container_of(a, struct its_device, dev_list); - struct its_device *devb = container_of(b, struct its_device, dev_list); - - if (deva->device_id < devb->device_id) - return -1; - else - return 1; -} - -/** - * vgic_its_save_device_tables - Save the device table and all ITT - * into guest RAM - * - * L1/L2 handling is hidden by vgic_its_check_id() helper which directly - * returns the GPA of the device entry - */ -static int vgic_its_save_device_tables(struct vgic_its *its) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - u64 baser = its->baser_device_table; - struct its_device *dev; - int dte_esz = abi->dte_esz; - - if (!(baser & GITS_BASER_VALID)) - return 0; - - list_sort(NULL, &its->device_list, vgic_its_device_cmp); - - list_for_each_entry(dev, &its->device_list, dev_list) { - int ret; - gpa_t eaddr; - - if (!vgic_its_check_id(its, baser, - dev->device_id, &eaddr)) - return -EINVAL; - - ret = vgic_its_save_itt(its, dev); - if (ret) - return ret; - - ret = vgic_its_save_dte(its, dev, eaddr, dte_esz); - if (ret) - return ret; - } - return 0; -} - -/** - * handle_l1_dte - callback used for L1 device table entries (2 stage case) - * - * @its: its handle - * @id: index of the entry in the L1 table - * @addr: kernel VA - * @opaque: unused - * - * L1 
table entries are scanned by steps of 1 entry - * Return < 0 if error, 0 if last dte was found when scanning the L2 - * table, +1 otherwise (meaning next L1 entry must be scanned) - */ -static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr, - void *opaque) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - int l2_start_id = id * (SZ_64K / abi->dte_esz); - u64 entry = *(u64 *)addr; - int dte_esz = abi->dte_esz; - gpa_t gpa; - int ret; - - entry = le64_to_cpu(entry); - - if (!(entry & KVM_ITS_L1E_VALID_MASK)) - return 1; - - gpa = entry & KVM_ITS_L1E_ADDR_MASK; - - ret = scan_its_table(its, gpa, SZ_64K, dte_esz, - l2_start_id, vgic_its_restore_dte, NULL); - - return ret; -} - -/** - * vgic_its_restore_device_tables - Restore the device table and all ITT - * from guest RAM to internal data structs - */ -static int vgic_its_restore_device_tables(struct vgic_its *its) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - u64 baser = its->baser_device_table; - int l1_esz, ret; - int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; - gpa_t l1_gpa; - - if (!(baser & GITS_BASER_VALID)) - return 0; - - l1_gpa = GITS_BASER_ADDR_48_to_52(baser); - - if (baser & GITS_BASER_INDIRECT) { - l1_esz = GITS_LVL1_ENTRY_SIZE; - ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0, - handle_l1_dte, NULL); - } else { - l1_esz = abi->dte_esz; - ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0, - vgic_its_restore_dte, NULL); - } - - /* scan_its_table returns +1 if all entries are invalid */ - if (ret > 0) - ret = 0; - - return ret; -} - -static int vgic_its_save_cte(struct vgic_its *its, - struct its_collection *collection, - gpa_t gpa, int esz) -{ - u64 val; - - val = (1ULL << KVM_ITS_CTE_VALID_SHIFT | - ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) | - collection->collection_id); - val = cpu_to_le64(val); - return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz); -} - -static int vgic_its_restore_cte(struct vgic_its *its, 
gpa_t gpa, int esz) -{ - struct its_collection *collection; - struct kvm *kvm = its->dev->kvm; - u32 target_addr, coll_id; - u64 val; - int ret; - - BUG_ON(esz > sizeof(val)); - ret = kvm_read_guest_lock(kvm, gpa, &val, esz); - if (ret) - return ret; - val = le64_to_cpu(val); - if (!(val & KVM_ITS_CTE_VALID_MASK)) - return 0; - - target_addr = (u32)(val >> KVM_ITS_CTE_RDBASE_SHIFT); - coll_id = val & KVM_ITS_CTE_ICID_MASK; - - if (target_addr != COLLECTION_NOT_MAPPED && - target_addr >= atomic_read(&kvm->online_vcpus)) - return -EINVAL; - - collection = find_collection(its, coll_id); - if (collection) - return -EEXIST; - ret = vgic_its_alloc_collection(its, &collection, coll_id); - if (ret) - return ret; - collection->target_addr = target_addr; - return 1; -} - -/** - * vgic_its_save_collection_table - Save the collection table into - * guest RAM - */ -static int vgic_its_save_collection_table(struct vgic_its *its) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - u64 baser = its->baser_coll_table; - gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser); - struct its_collection *collection; - u64 val; - size_t max_size, filled = 0; - int ret, cte_esz = abi->cte_esz; - - if (!(baser & GITS_BASER_VALID)) - return 0; - - max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; - - list_for_each_entry(collection, &its->collection_list, coll_list) { - ret = vgic_its_save_cte(its, collection, gpa, cte_esz); - if (ret) - return ret; - gpa += cte_esz; - filled += cte_esz; - } - - if (filled == max_size) - return 0; - - /* - * table is not fully filled, add a last dummy element - * with valid bit unset - */ - val = 0; - BUG_ON(cte_esz > sizeof(val)); - ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); - return ret; -} - -/** - * vgic_its_restore_collection_table - reads the collection table - * in guest memory and restores the ITS internal state. Requires the - * BASER registers to be restored before. 
- */ -static int vgic_its_restore_collection_table(struct vgic_its *its) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - u64 baser = its->baser_coll_table; - int cte_esz = abi->cte_esz; - size_t max_size, read = 0; - gpa_t gpa; - int ret; - - if (!(baser & GITS_BASER_VALID)) - return 0; - - gpa = GITS_BASER_ADDR_48_to_52(baser); - - max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; - - while (read < max_size) { - ret = vgic_its_restore_cte(its, gpa, cte_esz); - if (ret <= 0) - break; - gpa += cte_esz; - read += cte_esz; - } - - if (ret > 0) - return 0; - - return ret; -} - -/** - * vgic_its_save_tables_v0 - Save the ITS tables into guest ARM - * according to v0 ABI - */ -static int vgic_its_save_tables_v0(struct vgic_its *its) -{ - int ret; - - ret = vgic_its_save_device_tables(its); - if (ret) - return ret; - - return vgic_its_save_collection_table(its); -} - -/** - * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM - * to internal data structs according to V0 ABI - * - */ -static int vgic_its_restore_tables_v0(struct vgic_its *its) -{ - int ret; - - ret = vgic_its_restore_collection_table(its); - if (ret) - return ret; - - return vgic_its_restore_device_tables(its); -} - -static int vgic_its_commit_v0(struct vgic_its *its) -{ - const struct vgic_its_abi *abi; - - abi = vgic_its_get_abi(its); - its->baser_coll_table &= ~GITS_BASER_ENTRY_SIZE_MASK; - its->baser_device_table &= ~GITS_BASER_ENTRY_SIZE_MASK; - - its->baser_coll_table |= (GIC_ENCODE_SZ(abi->cte_esz, 5) - << GITS_BASER_ENTRY_SIZE_SHIFT); - - its->baser_device_table |= (GIC_ENCODE_SZ(abi->dte_esz, 5) - << GITS_BASER_ENTRY_SIZE_SHIFT); - return 0; -} - -static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its) -{ - /* We need to keep the ABI specific field values */ - its->baser_coll_table &= ~GITS_BASER_VALID; - its->baser_device_table &= ~GITS_BASER_VALID; - its->cbaser = 0; - its->creadr = 0; - its->cwriter = 0; - its->enabled = 0; - vgic_its_free_device_list(kvm, 
its); - vgic_its_free_collection_list(kvm, its); -} - -static int vgic_its_has_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: - switch (attr->attr) { - case KVM_VGIC_ITS_ADDR_TYPE: - return 0; - } - break; - case KVM_DEV_ARM_VGIC_GRP_CTRL: - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - return 0; - case KVM_DEV_ARM_ITS_CTRL_RESET: - return 0; - case KVM_DEV_ARM_ITS_SAVE_TABLES: - return 0; - case KVM_DEV_ARM_ITS_RESTORE_TABLES: - return 0; - } - break; - case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: - return vgic_its_has_attr_regs(dev, attr); - } - return -ENXIO; -} - -static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - int ret = 0; - - if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */ - return 0; - - mutex_lock(&kvm->lock); - mutex_lock(&its->its_lock); - - if (!lock_all_vcpus(kvm)) { - mutex_unlock(&its->its_lock); - mutex_unlock(&kvm->lock); - return -EBUSY; - } - - switch (attr) { - case KVM_DEV_ARM_ITS_CTRL_RESET: - vgic_its_reset(kvm, its); - break; - case KVM_DEV_ARM_ITS_SAVE_TABLES: - ret = abi->save_tables(its); - break; - case KVM_DEV_ARM_ITS_RESTORE_TABLES: - ret = abi->restore_tables(its); - break; - } - - unlock_all_vcpus(kvm); - mutex_unlock(&its->its_lock); - mutex_unlock(&kvm->lock); - return ret; -} - -static int vgic_its_set_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - struct vgic_its *its = dev->private; - int ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - unsigned long type = (unsigned long)attr->attr; - u64 addr; - - if (type != KVM_VGIC_ITS_ADDR_TYPE) - return -ENODEV; - - if (copy_from_user(&addr, uaddr, sizeof(addr))) - return -EFAULT; - - ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base, - addr, SZ_64K); - if (ret) - return ret; - - return 
vgic_register_its_iodev(dev->kvm, its, addr); - } - case KVM_DEV_ARM_VGIC_GRP_CTRL: - return vgic_its_ctrl(dev->kvm, its, attr->attr); - case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 reg; - - if (get_user(reg, uaddr)) - return -EFAULT; - - return vgic_its_attr_regs_access(dev, attr, ®, true); - } - } - return -ENXIO; -} - -static int vgic_its_get_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - struct vgic_its *its = dev->private; - u64 addr = its->vgic_its_base; - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - unsigned long type = (unsigned long)attr->attr; - - if (type != KVM_VGIC_ITS_ADDR_TYPE) - return -ENODEV; - - if (copy_to_user(uaddr, &addr, sizeof(addr))) - return -EFAULT; - break; - } - case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 reg; - int ret; - - ret = vgic_its_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - return put_user(reg, uaddr); - } - default: - return -ENXIO; - } - - return 0; -} - -static struct kvm_device_ops kvm_arm_vgic_its_ops = { - .name = "kvm-arm-vgic-its", - .create = vgic_its_create, - .destroy = vgic_its_destroy, - .set_attr = vgic_its_set_attr, - .get_attr = vgic_its_get_attr, - .has_attr = vgic_its_has_attr, -}; - -int kvm_vgic_register_its_device(void) -{ - return kvm_register_device_ops(&kvm_arm_vgic_its_ops, - KVM_DEV_TYPE_ARM_VGIC_ITS); -} diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c deleted file mode 100644 index 44419679f91a..000000000000 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ /dev/null @@ -1,741 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * VGIC: KVM DEVICE API - * - * Copyright (C) 2015 ARM Ltd. 
- * Author: Marc Zyngier - */ -#include -#include -#include -#include -#include -#include "vgic.h" - -/* common helpers */ - -int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t alignment) -{ - if (addr & ~kvm_phys_mask(kvm)) - return -E2BIG; - - if (!IS_ALIGNED(addr, alignment)) - return -EINVAL; - - if (!IS_VGIC_ADDR_UNDEF(*ioaddr)) - return -EEXIST; - - return 0; -} - -static int vgic_check_type(struct kvm *kvm, int type_needed) -{ - if (kvm->arch.vgic.vgic_model != type_needed) - return -ENODEV; - else - return 0; -} - -/** - * kvm_vgic_addr - set or get vgic VM base addresses - * @kvm: pointer to the vm struct - * @type: the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX - * @addr: pointer to address value - * @write: if true set the address in the VM address space, if false read the - * address - * - * Set or get the vgic base addresses for the distributor and the virtual CPU - * interface in the VM physical address space. These addresses are properties - * of the emulated core/SoC and therefore user space initially knows this - * information. - * Check them for sanity (alignment, double assignment). We can't check for - * overlapping regions in case of a virtual GICv3 here, since we don't know - * the number of VCPUs yet, so we defer this check to map_resources(). 
- */ -int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) -{ - int r = 0; - struct vgic_dist *vgic = &kvm->arch.vgic; - phys_addr_t *addr_ptr, alignment; - u64 undef_value = VGIC_ADDR_UNDEF; - - mutex_lock(&kvm->lock); - switch (type) { - case KVM_VGIC_V2_ADDR_TYPE_DIST: - r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); - addr_ptr = &vgic->vgic_dist_base; - alignment = SZ_4K; - break; - case KVM_VGIC_V2_ADDR_TYPE_CPU: - r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); - addr_ptr = &vgic->vgic_cpu_base; - alignment = SZ_4K; - break; - case KVM_VGIC_V3_ADDR_TYPE_DIST: - r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); - addr_ptr = &vgic->vgic_dist_base; - alignment = SZ_64K; - break; - case KVM_VGIC_V3_ADDR_TYPE_REDIST: { - struct vgic_redist_region *rdreg; - - r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); - if (r) - break; - if (write) { - r = vgic_v3_set_redist_base(kvm, 0, *addr, 0); - goto out; - } - rdreg = list_first_entry(&vgic->rd_regions, - struct vgic_redist_region, list); - if (!rdreg) - addr_ptr = &undef_value; - else - addr_ptr = &rdreg->base; - break; - } - case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION: - { - struct vgic_redist_region *rdreg; - u8 index; - - r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); - if (r) - break; - - index = *addr & KVM_VGIC_V3_RDIST_INDEX_MASK; - - if (write) { - gpa_t base = *addr & KVM_VGIC_V3_RDIST_BASE_MASK; - u32 count = (*addr & KVM_VGIC_V3_RDIST_COUNT_MASK) - >> KVM_VGIC_V3_RDIST_COUNT_SHIFT; - u8 flags = (*addr & KVM_VGIC_V3_RDIST_FLAGS_MASK) - >> KVM_VGIC_V3_RDIST_FLAGS_SHIFT; - - if (!count || flags) - r = -EINVAL; - else - r = vgic_v3_set_redist_base(kvm, index, - base, count); - goto out; - } - - rdreg = vgic_v3_rdist_region_from_index(kvm, index); - if (!rdreg) { - r = -ENOENT; - goto out; - } - - *addr = index; - *addr |= rdreg->base; - *addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT; - goto out; - } - default: - r = -ENODEV; - } - - if (r) - goto out; - - if 
(write) { - r = vgic_check_ioaddr(kvm, addr_ptr, *addr, alignment); - if (!r) - *addr_ptr = *addr; - } else { - *addr = *addr_ptr; - } - -out: - mutex_unlock(&kvm->lock); - return r; -} - -static int vgic_set_common_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int r; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 addr; - unsigned long type = (unsigned long)attr->attr; - - if (copy_from_user(&addr, uaddr, sizeof(addr))) - return -EFAULT; - - r = kvm_vgic_addr(dev->kvm, type, &addr, true); - return (r == -ENODEV) ? -ENXIO : r; - } - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 val; - int ret = 0; - - if (get_user(val, uaddr)) - return -EFAULT; - - /* - * We require: - * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs - * - at most 1024 interrupts - * - a multiple of 32 interrupts - */ - if (val < (VGIC_NR_PRIVATE_IRQS + 32) || - val > VGIC_MAX_RESERVED || - (val & 31)) - return -EINVAL; - - mutex_lock(&dev->kvm->lock); - - if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis) - ret = -EBUSY; - else - dev->kvm->arch.vgic.nr_spis = - val - VGIC_NR_PRIVATE_IRQS; - - mutex_unlock(&dev->kvm->lock); - - return ret; - } - case KVM_DEV_ARM_VGIC_GRP_CTRL: { - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - mutex_lock(&dev->kvm->lock); - r = vgic_init(dev->kvm); - mutex_unlock(&dev->kvm->lock); - return r; - } - break; - } - } - - return -ENXIO; -} - -static int vgic_get_common_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int r = -ENXIO; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 addr; - unsigned long type = (unsigned long)attr->attr; - - r = kvm_vgic_addr(dev->kvm, type, &addr, false); - if (r) - return (r == -ENODEV) ? 
-ENXIO : r; - - if (copy_to_user(uaddr, &addr, sizeof(addr))) - return -EFAULT; - break; - } - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - - r = put_user(dev->kvm->arch.vgic.nr_spis + - VGIC_NR_PRIVATE_IRQS, uaddr); - break; - } - } - - return r; -} - -static int vgic_create(struct kvm_device *dev, u32 type) -{ - return kvm_vgic_create(dev->kvm, type); -} - -static void vgic_destroy(struct kvm_device *dev) -{ - kfree(dev); -} - -int kvm_register_vgic_device(unsigned long type) -{ - int ret = -ENODEV; - - switch (type) { - case KVM_DEV_TYPE_ARM_VGIC_V2: - ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops, - KVM_DEV_TYPE_ARM_VGIC_V2); - break; - case KVM_DEV_TYPE_ARM_VGIC_V3: - ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops, - KVM_DEV_TYPE_ARM_VGIC_V3); - - if (ret) - break; - ret = kvm_vgic_register_its_device(); - break; - } - - return ret; -} - -int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, - struct vgic_reg_attr *reg_attr) -{ - int cpuid; - - cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >> - KVM_DEV_ARM_VGIC_CPUID_SHIFT; - - if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) - return -EINVAL; - - reg_attr->vcpu = kvm_get_vcpu(dev->kvm, cpuid); - reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; - - return 0; -} - -/* unlocks vcpus from @vcpu_lock_idx and smaller */ -static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx) -{ - struct kvm_vcpu *tmp_vcpu; - - for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { - tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); - mutex_unlock(&tmp_vcpu->mutex); - } -} - -void unlock_all_vcpus(struct kvm *kvm) -{ - unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1); -} - -/* Returns true if all vcpus were locked, false otherwise */ -bool lock_all_vcpus(struct kvm *kvm) -{ - struct kvm_vcpu *tmp_vcpu; - int c; - - /* - * Any time a vcpu is run, vcpu_load is called which tries to grab the - * vcpu->mutex. 
By grabbing the vcpu->mutex of all VCPUs we ensure - * that no other VCPUs are run and fiddle with the vgic state while we - * access it. - */ - kvm_for_each_vcpu(c, tmp_vcpu, kvm) { - if (!mutex_trylock(&tmp_vcpu->mutex)) { - unlock_vcpus(kvm, c - 1); - return false; - } - } - - return true; -} - -/** - * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state - * - * @dev: kvm device handle - * @attr: kvm device attribute - * @reg: address the value is read or written - * @is_write: true if userspace is writing a register - */ -static int vgic_v2_attr_regs_access(struct kvm_device *dev, - struct kvm_device_attr *attr, - u32 *reg, bool is_write) -{ - struct vgic_reg_attr reg_attr; - gpa_t addr; - struct kvm_vcpu *vcpu; - int ret; - - ret = vgic_v2_parse_attr(dev, attr, ®_attr); - if (ret) - return ret; - - vcpu = reg_attr.vcpu; - addr = reg_attr.addr; - - mutex_lock(&dev->kvm->lock); - - ret = vgic_init(dev->kvm); - if (ret) - goto out; - - if (!lock_all_vcpus(dev->kvm)) { - ret = -EBUSY; - goto out; - } - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg); - break; - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, reg); - break; - default: - ret = -EINVAL; - break; - } - - unlock_all_vcpus(dev->kvm); -out: - mutex_unlock(&dev->kvm->lock); - return ret; -} - -static int vgic_v2_set_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = vgic_set_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 reg; - - if (get_user(reg, uaddr)) - return -EFAULT; - - return vgic_v2_attr_regs_access(dev, attr, ®, true); - } - } - - return -ENXIO; -} - -static int vgic_v2_get_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = 
vgic_get_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 reg = 0; - - ret = vgic_v2_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - return put_user(reg, uaddr); - } - } - - return -ENXIO; -} - -static int vgic_v2_has_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: - switch (attr->attr) { - case KVM_VGIC_V2_ADDR_TYPE_DIST: - case KVM_VGIC_V2_ADDR_TYPE_CPU: - return 0; - } - break; - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - return vgic_v2_has_attr_regs(dev, attr); - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: - return 0; - case KVM_DEV_ARM_VGIC_GRP_CTRL: - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - return 0; - } - } - return -ENXIO; -} - -struct kvm_device_ops kvm_arm_vgic_v2_ops = { - .name = "kvm-arm-vgic-v2", - .create = vgic_create, - .destroy = vgic_destroy, - .set_attr = vgic_v2_set_attr, - .get_attr = vgic_v2_get_attr, - .has_attr = vgic_v2_has_attr, -}; - -int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, - struct vgic_reg_attr *reg_attr) -{ - unsigned long vgic_mpidr, mpidr_reg; - - /* - * For KVM_DEV_ARM_VGIC_GRP_DIST_REGS group, - * attr might not hold MPIDR. Hence assume vcpu0. 
- */ - if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) { - vgic_mpidr = (attr->attr & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) >> - KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT; - - mpidr_reg = VGIC_TO_MPIDR(vgic_mpidr); - reg_attr->vcpu = kvm_mpidr_to_vcpu(dev->kvm, mpidr_reg); - } else { - reg_attr->vcpu = kvm_get_vcpu(dev->kvm, 0); - } - - if (!reg_attr->vcpu) - return -EINVAL; - - reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; - - return 0; -} - -/* - * vgic_v3_attr_regs_access - allows user space to access VGIC v3 state - * - * @dev: kvm device handle - * @attr: kvm device attribute - * @reg: address the value is read or written - * @is_write: true if userspace is writing a register - */ -static int vgic_v3_attr_regs_access(struct kvm_device *dev, - struct kvm_device_attr *attr, - u64 *reg, bool is_write) -{ - struct vgic_reg_attr reg_attr; - gpa_t addr; - struct kvm_vcpu *vcpu; - int ret; - u32 tmp32; - - ret = vgic_v3_parse_attr(dev, attr, ®_attr); - if (ret) - return ret; - - vcpu = reg_attr.vcpu; - addr = reg_attr.addr; - - mutex_lock(&dev->kvm->lock); - - if (unlikely(!vgic_initialized(dev->kvm))) { - ret = -EBUSY; - goto out; - } - - if (!lock_all_vcpus(dev->kvm)) { - ret = -EBUSY; - goto out; - } - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - if (is_write) - tmp32 = *reg; - - ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &tmp32); - if (!is_write) - *reg = tmp32; - break; - case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: - if (is_write) - tmp32 = *reg; - - ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &tmp32); - if (!is_write) - *reg = tmp32; - break; - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 regid; - - regid = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK); - ret = vgic_v3_cpu_sysregs_uaccess(vcpu, is_write, - regid, reg); - break; - } - case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { - unsigned int info, intid; - - info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> - KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT; - if (info == 
VGIC_LEVEL_INFO_LINE_LEVEL) { - intid = attr->attr & - KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK; - ret = vgic_v3_line_level_info_uaccess(vcpu, is_write, - intid, reg); - } else { - ret = -EINVAL; - } - break; - } - default: - ret = -EINVAL; - break; - } - - unlock_all_vcpus(dev->kvm); -out: - mutex_unlock(&dev->kvm->lock); - return ret; -} - -static int vgic_v3_set_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = vgic_set_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 tmp32; - u64 reg; - - if (get_user(tmp32, uaddr)) - return -EFAULT; - - reg = tmp32; - return vgic_v3_attr_regs_access(dev, attr, ®, true); - } - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 reg; - - if (get_user(reg, uaddr)) - return -EFAULT; - - return vgic_v3_attr_regs_access(dev, attr, ®, true); - } - case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u64 reg; - u32 tmp32; - - if (get_user(tmp32, uaddr)) - return -EFAULT; - - reg = tmp32; - return vgic_v3_attr_regs_access(dev, attr, ®, true); - } - case KVM_DEV_ARM_VGIC_GRP_CTRL: { - int ret; - - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: - mutex_lock(&dev->kvm->lock); - - if (!lock_all_vcpus(dev->kvm)) { - mutex_unlock(&dev->kvm->lock); - return -EBUSY; - } - ret = vgic_v3_save_pending_tables(dev->kvm); - unlock_all_vcpus(dev->kvm); - mutex_unlock(&dev->kvm->lock); - return ret; - } - break; - } - } - return -ENXIO; -} - -static int vgic_v3_get_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = vgic_get_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: { - u32 
__user *uaddr = (u32 __user *)(long)attr->addr; - u64 reg; - u32 tmp32; - - ret = vgic_v3_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - tmp32 = reg; - return put_user(tmp32, uaddr); - } - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 reg; - - ret = vgic_v3_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - return put_user(reg, uaddr); - } - case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u64 reg; - u32 tmp32; - - ret = vgic_v3_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - tmp32 = reg; - return put_user(tmp32, uaddr); - } - } - return -ENXIO; -} - -static int vgic_v3_has_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: - switch (attr->attr) { - case KVM_VGIC_V3_ADDR_TYPE_DIST: - case KVM_VGIC_V3_ADDR_TYPE_REDIST: - case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION: - return 0; - } - break; - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: - return vgic_v3_has_attr_regs(dev, attr); - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: - return 0; - case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { - if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> - KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) == - VGIC_LEVEL_INFO_LINE_LEVEL) - return 0; - break; - } - case KVM_DEV_ARM_VGIC_GRP_CTRL: - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - return 0; - case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: - return 0; - } - } - return -ENXIO; -} - -struct kvm_device_ops kvm_arm_vgic_v3_ops = { - .name = "kvm-arm-vgic-v3", - .create = vgic_create, - .destroy = vgic_destroy, - .set_attr = vgic_v3_set_attr, - .get_attr = vgic_v3_get_attr, - .has_attr = vgic_v3_has_attr, -}; diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c deleted file mode 100644 index a016f07adc28..000000000000 
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ /dev/null @@ -1,550 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * VGICv2 MMIO handling functions - */ - -#include -#include -#include -#include - -#include -#include - -#include "vgic.h" -#include "vgic-mmio.h" - -/* - * The Revision field in the IIDR have the following meanings: - * - * Revision 1: Report GICv2 interrupts as group 0 instead of group 1 - * Revision 2: Interrupt groups are guest-configurable and signaled using - * their configured groups. - */ - -static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; - u32 value; - - switch (addr & 0x0c) { - case GIC_DIST_CTRL: - value = vgic->enabled ? GICD_ENABLE : 0; - break; - case GIC_DIST_CTR: - value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS; - value = (value >> 5) - 1; - value |= (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; - break; - case GIC_DIST_IIDR: - value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) | - (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) | - (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT); - break; - default: - return 0; - } - - return value; -} - -static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - bool was_enabled = dist->enabled; - - switch (addr & 0x0c) { - case GIC_DIST_CTRL: - dist->enabled = val & GICD_ENABLE; - if (!was_enabled && dist->enabled) - vgic_kick_vcpus(vcpu->kvm); - break; - case GIC_DIST_CTR: - case GIC_DIST_IIDR: - /* Nothing to do */ - return; - } -} - -static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - switch (addr & 0x0c) { - case GIC_DIST_IIDR: - if (val != vgic_mmio_read_v2_misc(vcpu, addr, len)) - return -EINVAL; - - /* - * If we observe a write to GICD_IIDR we know that userspace - * has been updated and has had a 
chance to cope with older - * kernels (VGICv2 IIDR.Revision == 0) incorrectly reporting - * interrupts as group 1, and therefore we now allow groups to - * be user writable. Doing this by default would break - * migration from old kernels to new kernels with legacy - * userspace. - */ - vcpu->kvm->arch.vgic.v2_groups_user_writable = true; - return 0; - } - - vgic_mmio_write_v2_misc(vcpu, addr, len, val); - return 0; -} - -static int vgic_mmio_uaccess_write_v2_group(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - if (vcpu->kvm->arch.vgic.v2_groups_user_writable) - vgic_mmio_write_group(vcpu, addr, len, val); - - return 0; -} - -static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - int nr_vcpus = atomic_read(&source_vcpu->kvm->online_vcpus); - int intid = val & 0xf; - int targets = (val >> 16) & 0xff; - int mode = (val >> 24) & 0x03; - int c; - struct kvm_vcpu *vcpu; - unsigned long flags; - - switch (mode) { - case 0x0: /* as specified by targets */ - break; - case 0x1: - targets = (1U << nr_vcpus) - 1; /* all, ... 
*/ - targets &= ~(1U << source_vcpu->vcpu_id); /* but self */ - break; - case 0x2: /* this very vCPU only */ - targets = (1U << source_vcpu->vcpu_id); - break; - case 0x3: /* reserved */ - return; - } - - kvm_for_each_vcpu(c, vcpu, source_vcpu->kvm) { - struct vgic_irq *irq; - - if (!(targets & (1U << c))) - continue; - - irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - irq->pending_latch = true; - irq->source |= 1U << source_vcpu->vcpu_id; - - vgic_queue_irq_unlock(source_vcpu->kvm, irq, flags); - vgic_put_irq(source_vcpu->kvm, irq); - } -} - -static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 8); - int i; - u64 val = 0; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - val |= (u64)irq->targets << (i * 8); - - vgic_put_irq(vcpu->kvm, irq); - } - - return val; -} - -static void vgic_mmio_write_target(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 8); - u8 cpu_mask = GENMASK(atomic_read(&vcpu->kvm->online_vcpus) - 1, 0); - int i; - unsigned long flags; - - /* GICD_ITARGETSR[0-7] are read-only */ - if (intid < VGIC_NR_PRIVATE_IRQS) - return; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i); - int target; - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - - irq->targets = (val >> (i * 8)) & cpu_mask; - target = irq->targets ? 
__ffs(irq->targets) : 0; - irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target); - - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - } -} - -static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = addr & 0x0f; - int i; - u64 val = 0; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - val |= (u64)irq->source << (i * 8); - - vgic_put_irq(vcpu->kvm, irq); - } - return val; -} - -static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = addr & 0x0f; - int i; - unsigned long flags; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - - irq->source &= ~((val >> (i * 8)) & 0xff); - if (!irq->source) - irq->pending_latch = false; - - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - } -} - -static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = addr & 0x0f; - int i; - unsigned long flags; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - - irq->source |= (val >> (i * 8)) & 0xff; - - if (irq->source) { - irq->pending_latch = true; - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - } else { - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - } - vgic_put_irq(vcpu->kvm, irq); - } -} - -#define GICC_ARCH_VERSION_V2 0x2 - -/* These are for userland accesses only, there is no guest-facing emulation. 
*/ -static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - struct vgic_vmcr vmcr; - u32 val; - - vgic_get_vmcr(vcpu, &vmcr); - - switch (addr & 0xff) { - case GIC_CPU_CTRL: - val = vmcr.grpen0 << GIC_CPU_CTRL_EnableGrp0_SHIFT; - val |= vmcr.grpen1 << GIC_CPU_CTRL_EnableGrp1_SHIFT; - val |= vmcr.ackctl << GIC_CPU_CTRL_AckCtl_SHIFT; - val |= vmcr.fiqen << GIC_CPU_CTRL_FIQEn_SHIFT; - val |= vmcr.cbpr << GIC_CPU_CTRL_CBPR_SHIFT; - val |= vmcr.eoim << GIC_CPU_CTRL_EOImodeNS_SHIFT; - - break; - case GIC_CPU_PRIMASK: - /* - * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the - * the PMR field as GICH_VMCR.VMPriMask rather than - * GICC_PMR.Priority, so we expose the upper five bits of - * priority mask to userspace using the lower bits in the - * unsigned long. - */ - val = (vmcr.pmr & GICV_PMR_PRIORITY_MASK) >> - GICV_PMR_PRIORITY_SHIFT; - break; - case GIC_CPU_BINPOINT: - val = vmcr.bpr; - break; - case GIC_CPU_ALIAS_BINPOINT: - val = vmcr.abpr; - break; - case GIC_CPU_IDENT: - val = ((PRODUCT_ID_KVM << 20) | - (GICC_ARCH_VERSION_V2 << 16) | - IMPLEMENTER_ARM); - break; - default: - return 0; - } - - return val; -} - -static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_vmcr vmcr; - - vgic_get_vmcr(vcpu, &vmcr); - - switch (addr & 0xff) { - case GIC_CPU_CTRL: - vmcr.grpen0 = !!(val & GIC_CPU_CTRL_EnableGrp0); - vmcr.grpen1 = !!(val & GIC_CPU_CTRL_EnableGrp1); - vmcr.ackctl = !!(val & GIC_CPU_CTRL_AckCtl); - vmcr.fiqen = !!(val & GIC_CPU_CTRL_FIQEn); - vmcr.cbpr = !!(val & GIC_CPU_CTRL_CBPR); - vmcr.eoim = !!(val & GIC_CPU_CTRL_EOImodeNS); - - break; - case GIC_CPU_PRIMASK: - /* - * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the - * the PMR field as GICH_VMCR.VMPriMask rather than - * GICC_PMR.Priority, so we expose the upper five bits of - * priority mask to userspace using the lower bits in the - * unsigned long. 
- */ - vmcr.pmr = (val << GICV_PMR_PRIORITY_SHIFT) & - GICV_PMR_PRIORITY_MASK; - break; - case GIC_CPU_BINPOINT: - vmcr.bpr = val; - break; - case GIC_CPU_ALIAS_BINPOINT: - vmcr.abpr = val; - break; - } - - vgic_set_vmcr(vcpu, &vmcr); -} - -static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - int n; /* which APRn is this */ - - n = (addr >> 2) & 0x3; - - if (kvm_vgic_global_state.type == VGIC_V2) { - /* GICv2 hardware systems support max. 32 groups */ - if (n != 0) - return 0; - return vcpu->arch.vgic_cpu.vgic_v2.vgic_apr; - } else { - struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; - - if (n > vgic_v3_max_apr_idx(vcpu)) - return 0; - - n = array_index_nospec(n, 4); - - /* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */ - return vgicv3->vgic_ap1r[n]; - } -} - -static void vgic_mmio_write_apr(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - int n; /* which APRn is this */ - - n = (addr >> 2) & 0x3; - - if (kvm_vgic_global_state.type == VGIC_V2) { - /* GICv2 hardware systems support max. 
32 groups */ - if (n != 0) - return; - vcpu->arch.vgic_cpu.vgic_v2.vgic_apr = val; - } else { - struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; - - if (n > vgic_v3_max_apr_idx(vcpu)) - return; - - n = array_index_nospec(n, 4); - - /* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */ - vgicv3->vgic_ap1r[n] = val; - } -} - -static const struct vgic_register_region vgic_v2_dist_registers[] = { - REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_DIST_CTRL, - vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc, - NULL, vgic_mmio_uaccess_write_v2_misc, - 12, VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP, - vgic_mmio_read_group, vgic_mmio_write_group, - NULL, vgic_mmio_uaccess_write_v2_group, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET, - vgic_mmio_read_enable, vgic_mmio_write_senable, - NULL, vgic_uaccess_write_senable, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR, - vgic_mmio_read_enable, vgic_mmio_write_cenable, - NULL, vgic_uaccess_write_cenable, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET, - vgic_mmio_read_pending, vgic_mmio_write_spending, - NULL, vgic_uaccess_write_spending, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR, - vgic_mmio_read_pending, vgic_mmio_write_cpending, - NULL, vgic_uaccess_write_cpending, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET, - vgic_mmio_read_active, vgic_mmio_write_sactive, - vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR, - vgic_mmio_read_active, vgic_mmio_write_cactive, - vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI, - vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL, - 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - 
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET, - vgic_mmio_read_target, vgic_mmio_write_target, NULL, NULL, 8, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG, - vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT, - vgic_mmio_read_raz, vgic_mmio_write_sgir, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_CLEAR, - vgic_mmio_read_sgipend, vgic_mmio_write_sgipendc, 16, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_SET, - vgic_mmio_read_sgipend, vgic_mmio_write_sgipends, 16, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), -}; - -static const struct vgic_register_region vgic_v2_cpu_registers[] = { - REGISTER_DESC_WITH_LENGTH(GIC_CPU_CTRL, - vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_CPU_PRIMASK, - vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_CPU_BINPOINT, - vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_CPU_ALIAS_BINPOINT, - vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_CPU_ACTIVEPRIO, - vgic_mmio_read_apr, vgic_mmio_write_apr, 16, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT, - vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, - VGIC_ACCESS_32bit), -}; - -unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev) -{ - dev->regions = vgic_v2_dist_registers; - dev->nr_regions = ARRAY_SIZE(vgic_v2_dist_registers); - - kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); - - return SZ_4K; -} - -int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) -{ - const struct vgic_register_region *region; - struct vgic_io_device iodev; - struct vgic_reg_attr reg_attr; - struct kvm_vcpu *vcpu; - gpa_t addr; - int ret; - - ret = 
vgic_v2_parse_attr(dev, attr, ®_attr); - if (ret) - return ret; - - vcpu = reg_attr.vcpu; - addr = reg_attr.addr; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - iodev.regions = vgic_v2_dist_registers; - iodev.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers); - iodev.base_addr = 0; - break; - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - iodev.regions = vgic_v2_cpu_registers; - iodev.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers); - iodev.base_addr = 0; - break; - default: - return -ENXIO; - } - - /* We only support aligned 32-bit accesses. */ - if (addr & 3) - return -ENXIO; - - region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32)); - if (!region) - return -ENXIO; - - return 0; -} - -int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val) -{ - struct vgic_io_device dev = { - .regions = vgic_v2_cpu_registers, - .nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers), - .iodev_type = IODEV_CPUIF, - }; - - return vgic_uaccess(vcpu, &dev, is_write, offset, val); -} - -int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val) -{ - struct vgic_io_device dev = { - .regions = vgic_v2_dist_registers, - .nr_regions = ARRAY_SIZE(vgic_v2_dist_registers), - .iodev_type = IODEV_DIST, - }; - - return vgic_uaccess(vcpu, &dev, is_write, offset, val); -} diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c deleted file mode 100644 index 89a14ec8b33b..000000000000 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ /dev/null @@ -1,1063 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * VGICv3 MMIO handling functions - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "vgic.h" -#include "vgic-mmio.h" - -/* extract @num bytes at @offset bytes offset in data */ -unsigned long extract_bytes(u64 data, unsigned int offset, - unsigned int num) -{ - return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0); -} - -/* allows 
updates of any half of a 64-bit register (or the whole thing) */ -u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, - unsigned long val) -{ - int lower = (offset & 4) * 8; - int upper = lower + 8 * len - 1; - - reg &= ~GENMASK_ULL(upper, lower); - val &= GENMASK_ULL(len * 8 - 1, 0); - - return reg | ((u64)val << lower); -} - -bool vgic_has_its(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - - if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) - return false; - - return dist->has_its; -} - -bool vgic_supports_direct_msis(struct kvm *kvm) -{ - return (kvm_vgic_global_state.has_gicv4_1 || - (kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm))); -} - -/* - * The Revision field in the IIDR have the following meanings: - * - * Revision 2: Interrupt groups are guest-configurable and signaled using - * their configured groups. - */ - -static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; - u32 value = 0; - - switch (addr & 0x0c) { - case GICD_CTLR: - if (vgic->enabled) - value |= GICD_CTLR_ENABLE_SS_G1; - value |= GICD_CTLR_ARE_NS | GICD_CTLR_DS; - if (vgic->nassgireq) - value |= GICD_CTLR_nASSGIreq; - break; - case GICD_TYPER: - value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS; - value = (value >> 5) - 1; - if (vgic_has_its(vcpu->kvm)) { - value |= (INTERRUPT_ID_BITS_ITS - 1) << 19; - value |= GICD_TYPER_LPIS; - } else { - value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19; - } - break; - case GICD_TYPER2: - if (kvm_vgic_global_state.has_gicv4_1) - value = GICD_TYPER2_nASSGIcap; - break; - case GICD_IIDR: - value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) | - (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) | - (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT); - break; - default: - return 0; - } - - return value; -} - -static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - 
struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - switch (addr & 0x0c) { - case GICD_CTLR: { - bool was_enabled, is_hwsgi; - - mutex_lock(&vcpu->kvm->lock); - - was_enabled = dist->enabled; - is_hwsgi = dist->nassgireq; - - dist->enabled = val & GICD_CTLR_ENABLE_SS_G1; - - /* Not a GICv4.1? No HW SGIs */ - if (!kvm_vgic_global_state.has_gicv4_1) - val &= ~GICD_CTLR_nASSGIreq; - - /* Dist stays enabled? nASSGIreq is RO */ - if (was_enabled && dist->enabled) { - val &= ~GICD_CTLR_nASSGIreq; - val |= FIELD_PREP(GICD_CTLR_nASSGIreq, is_hwsgi); - } - - /* Switching HW SGIs? */ - dist->nassgireq = val & GICD_CTLR_nASSGIreq; - if (is_hwsgi != dist->nassgireq) - vgic_v4_configure_vsgis(vcpu->kvm); - - if (kvm_vgic_global_state.has_gicv4_1 && - was_enabled != dist->enabled) - kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_RELOAD_GICv4); - else if (!was_enabled && dist->enabled) - vgic_kick_vcpus(vcpu->kvm); - - mutex_unlock(&vcpu->kvm->lock); - break; - } - case GICD_TYPER: - case GICD_TYPER2: - case GICD_IIDR: - /* This is at best for documentation purposes... */ - return; - } -} - -static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - switch (addr & 0x0c) { - case GICD_TYPER2: - case GICD_IIDR: - if (val != vgic_mmio_read_v3_misc(vcpu, addr, len)) - return -EINVAL; - return 0; - case GICD_CTLR: - /* Not a GICv4.1? 
No HW SGIs */ - if (!kvm_vgic_global_state.has_gicv4_1) - val &= ~GICD_CTLR_nASSGIreq; - - dist->enabled = val & GICD_CTLR_ENABLE_SS_G1; - dist->nassgireq = val & GICD_CTLR_nASSGIreq; - return 0; - } - - vgic_mmio_write_v3_misc(vcpu, addr, len, val); - return 0; -} - -static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - int intid = VGIC_ADDR_TO_INTID(addr, 64); - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid); - unsigned long ret = 0; - - if (!irq) - return 0; - - /* The upper word is RAZ for us. */ - if (!(addr & 4)) - ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len); - - vgic_put_irq(vcpu->kvm, irq); - return ret; -} - -static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - int intid = VGIC_ADDR_TO_INTID(addr, 64); - struct vgic_irq *irq; - unsigned long flags; - - /* The upper word is WI for us since we don't implement Aff3. */ - if (addr & 4) - return; - - irq = vgic_get_irq(vcpu->kvm, NULL, intid); - - if (!irq) - return; - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - - /* We only care about and preserve Aff0, Aff1 and Aff2. */ - irq->mpidr = val & GENMASK(23, 0); - irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr); - - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); -} - -static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - return vgic_cpu->lpis_enabled ? 
GICR_CTLR_ENABLE_LPIS : 0; -} - - -static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - bool was_enabled = vgic_cpu->lpis_enabled; - - if (!vgic_has_its(vcpu->kvm)) - return; - - vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS; - - if (was_enabled && !vgic_cpu->lpis_enabled) { - vgic_flush_pending_lpis(vcpu); - vgic_its_invalidate_cache(vcpu->kvm); - } - - if (!was_enabled && vgic_cpu->lpis_enabled) - vgic_enable_lpis(vcpu); -} - -static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu); - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_redist_region *rdreg = vgic_cpu->rdreg; - int target_vcpu_id = vcpu->vcpu_id; - gpa_t last_rdist_typer = rdreg->base + GICR_TYPER + - (rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE; - u64 value; - - value = (u64)(mpidr & GENMASK(23, 0)) << 32; - value |= ((target_vcpu_id & 0xffff) << 8); - - if (addr == last_rdist_typer) - value |= GICR_TYPER_LAST; - if (vgic_has_its(vcpu->kvm)) - value |= GICR_TYPER_PLPIS; - - return extract_bytes(value, addr & 7, len); -} - -static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); -} - -static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - switch (addr & 0xffff) { - case GICD_PIDR2: - /* report a GICv3 compliant implementation */ - return 0x3b; - } - - return 0; -} - -static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - u32 value = 0; - int i; - - /* - * pending state of interrupt is latched in pending_latch variable. - * Userspace will save and restore pending state and line_level - * separately. 
- * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.txt - * for handling of ISPENDR and ICPENDR. - */ - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - bool state = irq->pending_latch; - - if (irq->hw && vgic_irq_is_sgi(irq->intid)) { - int err; - - err = irq_get_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - &state); - WARN_ON(err); - } - - if (state) - value |= (1U << i); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - -static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - if (test_bit(i, &val)) { - /* - * pending_latch is set irrespective of irq type - * (level or edge) to avoid dependency that VM should - * restore irq config before pending info. - */ - irq->pending_latch = true; - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - } else { - irq->pending_latch = false; - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - } - - vgic_put_irq(vcpu->kvm, irq); - } - - return 0; -} - -/* We want to avoid outer shareable. */ -u64 vgic_sanitise_shareability(u64 field) -{ - switch (field) { - case GIC_BASER_OuterShareable: - return GIC_BASER_InnerShareable; - default: - return field; - } -} - -/* Avoid any inner non-cacheable mapping. */ -u64 vgic_sanitise_inner_cacheability(u64 field) -{ - switch (field) { - case GIC_BASER_CACHE_nCnB: - case GIC_BASER_CACHE_nC: - return GIC_BASER_CACHE_RaWb; - default: - return field; - } -} - -/* Non-cacheable or same-as-inner are OK. 
*/ -u64 vgic_sanitise_outer_cacheability(u64 field) -{ - switch (field) { - case GIC_BASER_CACHE_SameAsInner: - case GIC_BASER_CACHE_nC: - return field; - default: - return GIC_BASER_CACHE_nC; - } -} - -u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift, - u64 (*sanitise_fn)(u64)) -{ - u64 field = (reg & field_mask) >> field_shift; - - field = sanitise_fn(field) << field_shift; - return (reg & ~field_mask) | field; -} - -#define PROPBASER_RES0_MASK \ - (GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5)) -#define PENDBASER_RES0_MASK \ - (BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) | \ - GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0)) - -static u64 vgic_sanitise_pendbaser(u64 reg) -{ - reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK, - GICR_PENDBASER_SHAREABILITY_SHIFT, - vgic_sanitise_shareability); - reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK, - GICR_PENDBASER_INNER_CACHEABILITY_SHIFT, - vgic_sanitise_inner_cacheability); - reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK, - GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT, - vgic_sanitise_outer_cacheability); - - reg &= ~PENDBASER_RES0_MASK; - - return reg; -} - -static u64 vgic_sanitise_propbaser(u64 reg) -{ - reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK, - GICR_PROPBASER_SHAREABILITY_SHIFT, - vgic_sanitise_shareability); - reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK, - GICR_PROPBASER_INNER_CACHEABILITY_SHIFT, - vgic_sanitise_inner_cacheability); - reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK, - GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT, - vgic_sanitise_outer_cacheability); - - reg &= ~PROPBASER_RES0_MASK; - return reg; -} - -static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return extract_bytes(dist->propbaser, addr & 7, len); -} - -static void 
vgic_mmio_write_propbase(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - u64 old_propbaser, propbaser; - - /* Storing a value with LPIs already enabled is undefined */ - if (vgic_cpu->lpis_enabled) - return; - - do { - old_propbaser = READ_ONCE(dist->propbaser); - propbaser = old_propbaser; - propbaser = update_64bit_reg(propbaser, addr & 4, len, val); - propbaser = vgic_sanitise_propbaser(propbaser); - } while (cmpxchg64(&dist->propbaser, old_propbaser, - propbaser) != old_propbaser); -} - -static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - u64 value = vgic_cpu->pendbaser; - - value &= ~GICR_PENDBASER_PTZ; - - return extract_bytes(value, addr & 7, len); -} - -static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - u64 old_pendbaser, pendbaser; - - /* Storing a value with LPIs already enabled is undefined */ - if (vgic_cpu->lpis_enabled) - return; - - do { - old_pendbaser = READ_ONCE(vgic_cpu->pendbaser); - pendbaser = old_pendbaser; - pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val); - pendbaser = vgic_sanitise_pendbaser(pendbaser); - } while (cmpxchg64(&vgic_cpu->pendbaser, old_pendbaser, - pendbaser) != old_pendbaser); -} - -/* - * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the - * redistributors, while SPIs are covered by registers in the distributor - * block. Trying to set private IRQs in this block gets ignored. - * We take some special care here to fix the calculation of the register - * offset. 
- */ -#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, ur, uw, bpi, acc) \ - { \ - .reg_offset = off, \ - .bits_per_irq = bpi, \ - .len = (bpi * VGIC_NR_PRIVATE_IRQS) / 8, \ - .access_flags = acc, \ - .read = vgic_mmio_read_raz, \ - .write = vgic_mmio_write_wi, \ - }, { \ - .reg_offset = off + (bpi * VGIC_NR_PRIVATE_IRQS) / 8, \ - .bits_per_irq = bpi, \ - .len = (bpi * (1024 - VGIC_NR_PRIVATE_IRQS)) / 8, \ - .access_flags = acc, \ - .read = rd, \ - .write = wr, \ - .uaccess_read = ur, \ - .uaccess_write = uw, \ - } - -static const struct vgic_register_region vgic_v3_dist_registers[] = { - REGISTER_DESC_WITH_LENGTH_UACCESS(GICD_CTLR, - vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc, - NULL, vgic_mmio_uaccess_write_v3_misc, - 16, VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICD_STATUSR, - vgic_mmio_read_rao, vgic_mmio_write_wi, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR, - vgic_mmio_read_group, vgic_mmio_write_group, NULL, NULL, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER, - vgic_mmio_read_enable, vgic_mmio_write_senable, - NULL, vgic_uaccess_write_senable, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER, - vgic_mmio_read_enable, vgic_mmio_write_cenable, - NULL, vgic_uaccess_write_cenable, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR, - vgic_mmio_read_pending, vgic_mmio_write_spending, - vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR, - vgic_mmio_read_pending, vgic_mmio_write_cpending, - vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER, - vgic_mmio_read_active, vgic_mmio_write_sactive, - vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER, - 
vgic_mmio_read_active, vgic_mmio_write_cactive, - vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, - 1, VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR, - vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL, - 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR, - vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 8, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR, - vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR, - vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER, - vgic_mmio_read_irouter, vgic_mmio_write_irouter, NULL, NULL, 64, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICD_IDREGS, - vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, - VGIC_ACCESS_32bit), -}; - -static const struct vgic_register_region vgic_v3_rd_registers[] = { - /* RD_base registers */ - REGISTER_DESC_WITH_LENGTH(GICR_CTLR, - vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_STATUSR, - vgic_mmio_read_raz, vgic_mmio_write_wi, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_IIDR, - vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_TYPER, - vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_WAKER, - vgic_mmio_read_raz, vgic_mmio_write_wi, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER, - vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER, - vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - 
REGISTER_DESC_WITH_LENGTH(GICR_IDREGS, - vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, - VGIC_ACCESS_32bit), - /* SGI_base registers */ - REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGROUPR0, - vgic_mmio_read_group, vgic_mmio_write_group, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISENABLER0, - vgic_mmio_read_enable, vgic_mmio_write_senable, - NULL, vgic_uaccess_write_senable, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICENABLER0, - vgic_mmio_read_enable, vgic_mmio_write_cenable, - NULL, vgic_uaccess_write_cenable, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0, - vgic_mmio_read_pending, vgic_mmio_write_spending, - vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0, - vgic_mmio_read_pending, vgic_mmio_write_cpending, - vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISACTIVER0, - vgic_mmio_read_active, vgic_mmio_write_sactive, - vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICACTIVER0, - vgic_mmio_read_active, vgic_mmio_write_cactive, - vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IPRIORITYR0, - vgic_mmio_read_priority, vgic_mmio_write_priority, 32, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ICFGR0, - vgic_mmio_read_config, vgic_mmio_write_config, 8, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGRPMODR0, - vgic_mmio_read_raz, vgic_mmio_write_wi, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_NSACR, - vgic_mmio_read_raz, vgic_mmio_write_wi, 4, - VGIC_ACCESS_32bit), -}; - -unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev) -{ - dev->regions = 
vgic_v3_dist_registers; - dev->nr_regions = ARRAY_SIZE(vgic_v3_dist_registers); - - kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); - - return SZ_64K; -} - -/** - * vgic_register_redist_iodev - register a single redist iodev - * @vcpu: The VCPU to which the redistributor belongs - * - * Register a KVM iodev for this VCPU's redistributor using the address - * provided. - * - * Return 0 on success, -ERRNO otherwise. - */ -int vgic_register_redist_iodev(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - struct vgic_dist *vgic = &kvm->arch.vgic; - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev; - struct vgic_redist_region *rdreg; - gpa_t rd_base; - int ret; - - if (!IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) - return 0; - - /* - * We may be creating VCPUs before having set the base address for the - * redistributor region, in which case we will come back to this - * function for all VCPUs when the base address is set. Just return - * without doing any work for now. 
- */ - rdreg = vgic_v3_rdist_free_slot(&vgic->rd_regions); - if (!rdreg) - return 0; - - if (!vgic_v3_check_base(kvm)) - return -EINVAL; - - vgic_cpu->rdreg = rdreg; - - rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE; - - kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops); - rd_dev->base_addr = rd_base; - rd_dev->iodev_type = IODEV_REDIST; - rd_dev->regions = vgic_v3_rd_registers; - rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rd_registers); - rd_dev->redist_vcpu = vcpu; - - mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base, - 2 * SZ_64K, &rd_dev->dev); - mutex_unlock(&kvm->slots_lock); - - if (ret) - return ret; - - rdreg->free_index++; - return 0; -} - -static void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu) -{ - struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev; - - kvm_io_bus_unregister_dev(vcpu->kvm, KVM_MMIO_BUS, &rd_dev->dev); -} - -static int vgic_register_all_redist_iodevs(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int c, ret = 0; - - kvm_for_each_vcpu(c, vcpu, kvm) { - ret = vgic_register_redist_iodev(vcpu); - if (ret) - break; - } - - if (ret) { - /* The current c failed, so we start with the previous one. */ - mutex_lock(&kvm->slots_lock); - for (c--; c >= 0; c--) { - vcpu = kvm_get_vcpu(kvm, c); - vgic_unregister_redist_iodev(vcpu); - } - mutex_unlock(&kvm->slots_lock); - } - - return ret; -} - -/** - * vgic_v3_insert_redist_region - Insert a new redistributor region - * - * Performs various checks before inserting the rdist region in the list. - * Those tests depend on whether the size of the rdist region is known - * (ie. count != 0). The list is sorted by rdist region index. 
- * - * @kvm: kvm handle - * @index: redist region index - * @base: base of the new rdist region - * @count: number of redistributors the region is made of (0 in the old style - * single region, whose size is induced from the number of vcpus) - * - * Return 0 on success, < 0 otherwise - */ -static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index, - gpa_t base, uint32_t count) -{ - struct vgic_dist *d = &kvm->arch.vgic; - struct vgic_redist_region *rdreg; - struct list_head *rd_regions = &d->rd_regions; - size_t size = count * KVM_VGIC_V3_REDIST_SIZE; - int ret; - - /* single rdist region already set ?*/ - if (!count && !list_empty(rd_regions)) - return -EINVAL; - - /* cross the end of memory ? */ - if (base + size < base) - return -EINVAL; - - if (list_empty(rd_regions)) { - if (index != 0) - return -EINVAL; - } else { - rdreg = list_last_entry(rd_regions, - struct vgic_redist_region, list); - if (index != rdreg->index + 1) - return -EINVAL; - - /* Cannot add an explicitly sized regions after legacy region */ - if (!rdreg->count) - return -EINVAL; - } - - /* - * For legacy single-region redistributor regions (!count), - * check that the redistributor region does not overlap with the - * distributor's address space. - */ - if (!count && !IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) && - vgic_dist_overlap(kvm, base, size)) - return -EINVAL; - - /* collision with any other rdist region? 
*/ - if (vgic_v3_rdist_overlap(kvm, base, size)) - return -EINVAL; - - rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL); - if (!rdreg) - return -ENOMEM; - - rdreg->base = VGIC_ADDR_UNDEF; - - ret = vgic_check_ioaddr(kvm, &rdreg->base, base, SZ_64K); - if (ret) - goto free; - - rdreg->base = base; - rdreg->count = count; - rdreg->free_index = 0; - rdreg->index = index; - - list_add_tail(&rdreg->list, rd_regions); - return 0; -free: - kfree(rdreg); - return ret; -} - -int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count) -{ - int ret; - - ret = vgic_v3_insert_redist_region(kvm, index, addr, count); - if (ret) - return ret; - - /* - * Register iodevs for each existing VCPU. Adding more VCPUs - * afterwards will register the iodevs when needed. - */ - ret = vgic_register_all_redist_iodevs(kvm); - if (ret) - return ret; - - return 0; -} - -int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) -{ - const struct vgic_register_region *region; - struct vgic_io_device iodev; - struct vgic_reg_attr reg_attr; - struct kvm_vcpu *vcpu; - gpa_t addr; - int ret; - - ret = vgic_v3_parse_attr(dev, attr, &reg_attr); - if (ret) - return ret; - - vcpu = reg_attr.vcpu; - addr = reg_attr.addr; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - iodev.regions = vgic_v3_dist_registers; - iodev.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers); - iodev.base_addr = 0; - break; - case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:{ - iodev.regions = vgic_v3_rd_registers; - iodev.nr_regions = ARRAY_SIZE(vgic_v3_rd_registers); - iodev.base_addr = 0; - break; - } - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 reg, id; - - id = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK); - return vgic_v3_has_cpu_sysregs_attr(vcpu, 0, id, &reg); - } - default: - return -ENXIO; - } - - /* We only support aligned 32-bit accesses.
*/ - if (addr & 3) - return -ENXIO; - - region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32)); - if (!region) - return -ENXIO; - - return 0; -} -/* - * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI - * generation register ICC_SGI1R_EL1) with a given VCPU. - * If the VCPU's MPIDR matches, return the level0 affinity, otherwise - * return -1. - */ -static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu) -{ - unsigned long affinity; - int level0; - - /* - * Split the current VCPU's MPIDR into affinity level 0 and the - * rest as this is what we have to compare against. - */ - affinity = kvm_vcpu_get_mpidr_aff(vcpu); - level0 = MPIDR_AFFINITY_LEVEL(affinity, 0); - affinity &= ~MPIDR_LEVEL_MASK; - - /* bail out if the upper three levels don't match */ - if (sgi_aff != affinity) - return -1; - - /* Is this VCPU's bit set in the mask ? */ - if (!(sgi_cpu_mask & BIT(level0))) - return -1; - - return level0; -} - -/* - * The ICC_SGI* registers encode the affinity differently from the MPIDR, - * so provide a wrapper to use the existing defines to isolate a certain - * affinity level. - */ -#define SGI_AFFINITY_LEVEL(reg, level) \ - ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \ - >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) - -/** - * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs - * @vcpu: The VCPU requesting a SGI - * @reg: The value written into ICC_{ASGI1,SGI0,SGI1}R by that VCPU - * @allow_group1: Does the sysreg access allow generation of G1 SGIs - * - * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register. - * This will trap in sys_regs.c and call this function. - * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the - * target processors as well as a bitmask of 16 Aff0 CPUs. - * If the interrupt routing mode bit is not set, we iterate over all VCPUs to - * check for matching ones. 
If this bit is set, we signal all, but not the - * calling VCPU. - */ -void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu *c_vcpu; - u16 target_cpus; - u64 mpidr; - int sgi, c; - int vcpu_id = vcpu->vcpu_id; - bool broadcast; - unsigned long flags; - - sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT; - broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT); - target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT; - mpidr = SGI_AFFINITY_LEVEL(reg, 3); - mpidr |= SGI_AFFINITY_LEVEL(reg, 2); - mpidr |= SGI_AFFINITY_LEVEL(reg, 1); - - /* - * We iterate over all VCPUs to find the MPIDRs matching the request. - * If we have handled one CPU, we clear its bit to detect early - * if we are already finished. This avoids iterating through all - * VCPUs when most of the times we just signal a single VCPU. - */ - kvm_for_each_vcpu(c, c_vcpu, kvm) { - struct vgic_irq *irq; - - /* Exit early if we have dealt with all requested CPUs */ - if (!broadcast && target_cpus == 0) - break; - - /* Don't signal the calling VCPU */ - if (broadcast && c == vcpu_id) - continue; - - if (!broadcast) { - int level0; - - level0 = match_mpidr(mpidr, target_cpus, c_vcpu); - if (level0 == -1) - continue; - - /* remove this matching VCPU from the mask */ - target_cpus &= ~BIT(level0); - } - - irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - - /* - * An access targetting Group0 SGIs can only generate - * those, while an access targetting Group1 SGIs can - * generate interrupts of either group. - */ - if (!irq->group || allow_group1) { - if (!irq->hw) { - irq->pending_latch = true; - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - } else { - /* HW SGI? 
Ask the GIC to inject it */ - int err; - err = irq_set_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - true); - WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - } - } else { - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - } - - vgic_put_irq(vcpu->kvm, irq); - } -} - -int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val) -{ - struct vgic_io_device dev = { - .regions = vgic_v3_dist_registers, - .nr_regions = ARRAY_SIZE(vgic_v3_dist_registers), - }; - - return vgic_uaccess(vcpu, &dev, is_write, offset, val); -} - -int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val) -{ - struct vgic_io_device rd_dev = { - .regions = vgic_v3_rd_registers, - .nr_regions = ARRAY_SIZE(vgic_v3_rd_registers), - }; - - return vgic_uaccess(vcpu, &rd_dev, is_write, offset, val); -} - -int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, - u32 intid, u64 *val) -{ - if (intid % 32) - return -EINVAL; - - if (is_write) - vgic_write_irq_line_level_info(vcpu, intid, *val); - else - *val = vgic_read_irq_line_level_info(vcpu, intid); - - return 0; -} diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c deleted file mode 100644 index b2d73fc0d1ef..000000000000 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ /dev/null @@ -1,1088 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * VGIC MMIO handling functions - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "vgic.h" -#include "vgic-mmio.h" - -unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - return 0; -} - -unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - return -1UL; -} - -void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val) -{ - /* Ignore */ -} - -int vgic_mmio_uaccess_write_wi(struct 
kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val) -{ - /* Ignore */ - return 0; -} - -unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - u32 value = 0; - int i; - - /* Loop over all IRQs affected by this read */ - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - if (irq->group) - value |= BIT(i); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - -static void vgic_update_vsgi(struct vgic_irq *irq) -{ - WARN_ON(its_prop_update_vsgi(irq->host_irq, irq->priority, irq->group)); -} - -void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - irq->group = !!(val & BIT(i)); - if (irq->hw && vgic_irq_is_sgi(irq->intid)) { - vgic_update_vsgi(irq); - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - } else { - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - } - - vgic_put_irq(vcpu->kvm, irq); - } -} - -/* - * Read accesses to both GICD_ICENABLER and GICD_ISENABLER return the value - * of the enabled bit, so there is only one function for both here. 
- */ -unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - u32 value = 0; - int i; - - /* Loop over all IRQs affected by this read */ - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - if (irq->enabled) - value |= (1U << i); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - -void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - if (irq->hw && vgic_irq_is_sgi(irq->intid)) { - if (!irq->enabled) { - struct irq_data *data; - - irq->enabled = true; - data = &irq_to_desc(irq->host_irq)->irq_data; - while (irqd_irq_disabled(data)) - enable_irq(irq->host_irq); - } - - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - - continue; - } else if (vgic_irq_is_mapped_level(irq)) { - bool was_high = irq->line_level; - - /* - * We need to update the state of the interrupt because - * the guest might have changed the state of the device - * while the interrupt was disabled at the VGIC level. - */ - irq->line_level = vgic_get_phys_line_level(irq); - /* - * Deactivate the physical interrupt so the GIC will let - * us know when it is asserted again. 
- */ - if (!irq->active && was_high && !irq->line_level) - vgic_irq_set_phys_active(irq, false); - } - irq->enabled = true; - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - - vgic_put_irq(vcpu->kvm, irq); - } -} - -void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - if (irq->hw && vgic_irq_is_sgi(irq->intid) && irq->enabled) - disable_irq_nosync(irq->host_irq); - - irq->enabled = false; - - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - } -} - -int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - irq->enabled = true; - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - - vgic_put_irq(vcpu->kvm, irq); - } - - return 0; -} - -int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - irq->enabled = false; - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - - vgic_put_irq(vcpu->kvm, irq); - } - - return 0; -} - -unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - u32 value = 0; - int i; - - /* Loop over all IRQs affected by this read */ - for (i = 0; i < len * 8; i++) { - struct vgic_irq 
*irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - unsigned long flags; - bool val; - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - if (irq->hw && vgic_irq_is_sgi(irq->intid)) { - int err; - - val = false; - err = irq_get_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - &val); - WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); - } else { - val = irq_is_pending(irq); - } - - value |= ((u32)val << i); - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - -static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq) -{ - return (vgic_irq_is_sgi(irq->intid) && - vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2); -} - -void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - /* GICD_ISPENDR0 SGI bits are WI */ - if (is_vgic_v2_sgi(vcpu, irq)) { - vgic_put_irq(vcpu->kvm, irq); - continue; - } - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - - if (irq->hw && vgic_irq_is_sgi(irq->intid)) { - /* HW SGI? 
Ask the GIC to inject it */ - int err; - err = irq_set_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - true); - WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); - - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - - continue; - } - - irq->pending_latch = true; - if (irq->hw) - vgic_irq_set_phys_active(irq, true); - - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - vgic_put_irq(vcpu->kvm, irq); - } -} - -int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - irq->pending_latch = true; - - /* - * GICv2 SGIs are terribly broken. We can't restore - * the source of the interrupt, so just pick the vcpu - * itself as the source... - */ - if (is_vgic_v2_sgi(vcpu, irq)) - irq->source |= BIT(vcpu->vcpu_id); - - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - - vgic_put_irq(vcpu->kvm, irq); - } - - return 0; -} - -/* Must be called with irq->irq_lock held */ -static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq) -{ - irq->pending_latch = false; - - /* - * We don't want the guest to effectively mask the physical - * interrupt by doing a write to SPENDR followed by a write to - * CPENDR for HW interrupts, so we clear the active state on - * the physical side if the virtual interrupt is not active. - * This may lead to taking an additional interrupt on the - * host, but that should not be a problem as the worst that - * can happen is an additional vgic injection. We also clear - * the pending state to maintain proper semantics for edge HW - * interrupts. 
- */ - vgic_irq_set_phys_pending(irq, false); - if (!irq->active) - vgic_irq_set_phys_active(irq, false); -} - -void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - /* GICD_ICPENDR0 SGI bits are WI */ - if (is_vgic_v2_sgi(vcpu, irq)) { - vgic_put_irq(vcpu->kvm, irq); - continue; - } - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - - if (irq->hw && vgic_irq_is_sgi(irq->intid)) { - /* HW SGI? Ask the GIC to clear its pending bit */ - int err; - err = irq_set_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - false); - WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); - - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - - continue; - } - - if (irq->hw) - vgic_hw_irq_cpending(vcpu, irq); - else - irq->pending_latch = false; - - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - } -} - -int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - /* - * More fun with GICv2 SGIs! If we're clearing one of them - * from userspace, which source vcpu to clear? Let's not - * even think of it, and blow the whole set. 
- */ - if (is_vgic_v2_sgi(vcpu, irq)) - irq->source = 0; - - irq->pending_latch = false; - - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - - vgic_put_irq(vcpu->kvm, irq); - } - - return 0; -} - -/* - * If we are fiddling with an IRQ's active state, we have to make sure the IRQ - * is not queued on some running VCPU's LRs, because then the change to the - * active state can be overwritten when the VCPU's state is synced coming back - * from the guest. - * - * For shared interrupts as well as GICv3 private interrupts, we have to - * stop all the VCPUs because interrupts can be migrated while we don't hold - * the IRQ locks and we don't want to be chasing moving targets. - * - * For GICv2 private interrupts we don't have to do anything because - * userspace accesses to the VGIC state already require all VCPUs to be - * stopped, and only the VCPU itself can modify its private interrupts - * active state, which guarantees that the VCPU is not running. - */ -static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid) -{ - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || - intid >= VGIC_NR_PRIVATE_IRQS) - kvm_arm_halt_guest(vcpu->kvm); -} - -/* See vgic_access_active_prepare */ -static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid) -{ - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || - intid >= VGIC_NR_PRIVATE_IRQS) - kvm_arm_resume_guest(vcpu->kvm); -} - -static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - u32 value = 0; - int i; - - /* Loop over all IRQs affected by this read */ - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - /* - * Even for HW interrupts, don't evaluate the HW state as - * all the guest is interested in is the virtual state. 
- */ - if (irq->active) - value |= (1U << i); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - -unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - u32 val; - - mutex_lock(&vcpu->kvm->lock); - vgic_access_active_prepare(vcpu, intid); - - val = __vgic_mmio_read_active(vcpu, addr, len); - - vgic_access_active_finish(vcpu, intid); - mutex_unlock(&vcpu->kvm->lock); - - return val; -} - -unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - return __vgic_mmio_read_active(vcpu, addr, len); -} - -/* Must be called with irq->irq_lock held */ -static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, - bool active, bool is_uaccess) -{ - if (is_uaccess) - return; - - irq->active = active; - vgic_irq_set_phys_active(irq, active); -} - -static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, - bool active) -{ - unsigned long flags; - struct kvm_vcpu *requester_vcpu = kvm_get_running_vcpu(); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - - if (irq->hw && !vgic_irq_is_sgi(irq->intid)) { - vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); - } else if (irq->hw && vgic_irq_is_sgi(irq->intid)) { - /* - * GICv4.1 VSGI feature doesn't track an active state, - * so let's not kid ourselves, there is nothing we can - * do here. - */ - irq->active = false; - } else { - u32 model = vcpu->kvm->arch.vgic.vgic_model; - u8 active_source; - - irq->active = active; - - /* - * The GICv2 architecture indicates that the source CPUID for - * an SGI should be provided during an EOI which implies that - * the active state is stored somewhere, but at the same time - * this state is not architecturally exposed anywhere and we - * have no way of knowing the right source. 
- * - * This may lead to a VCPU not being able to receive - * additional instances of a particular SGI after migration - * for a GICv2 VM on some GIC implementations. Oh well. - */ - active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0; - - if (model == KVM_DEV_TYPE_ARM_VGIC_V2 && - active && vgic_irq_is_sgi(irq->intid)) - irq->active_source = active_source; - } - - if (irq->active) - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - else - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); -} - -static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - vgic_mmio_change_active(vcpu, irq, false); - vgic_put_irq(vcpu->kvm, irq); - } -} - -void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - - mutex_lock(&vcpu->kvm->lock); - vgic_access_active_prepare(vcpu, intid); - - __vgic_mmio_write_cactive(vcpu, addr, len, val); - - vgic_access_active_finish(vcpu, intid); - mutex_unlock(&vcpu->kvm->lock); -} - -int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - __vgic_mmio_write_cactive(vcpu, addr, len, val); - return 0; -} - -static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - vgic_mmio_change_active(vcpu, irq, true); - vgic_put_irq(vcpu->kvm, irq); - } -} - -void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - - mutex_lock(&vcpu->kvm->lock); - 
vgic_access_active_prepare(vcpu, intid); - - __vgic_mmio_write_sactive(vcpu, addr, len, val); - - vgic_access_active_finish(vcpu, intid); - mutex_unlock(&vcpu->kvm->lock); -} - -int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - __vgic_mmio_write_sactive(vcpu, addr, len, val); - return 0; -} - -unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 8); - int i; - u64 val = 0; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - val |= (u64)irq->priority << (i * 8); - - vgic_put_irq(vcpu->kvm, irq); - } - - return val; -} - -/* - * We currently don't handle changing the priority of an interrupt that - * is already pending on a VCPU. If there is a need for this, we would - * need to make this VCPU exit and re-evaluate the priorities, potentially - * leading to this interrupt getting presented now to the guest (if it has - * been masked by the priority mask before). 
- */ -void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 8); - int i; - unsigned long flags; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - /* Narrow the priority range to what we actually support */ - irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS); - if (irq->hw && vgic_irq_is_sgi(irq->intid)) - vgic_update_vsgi(irq); - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - - vgic_put_irq(vcpu->kvm, irq); - } -} - -unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 2); - u32 value = 0; - int i; - - for (i = 0; i < len * 4; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - if (irq->config == VGIC_CONFIG_EDGE) - value |= (2U << (i * 2)); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - -void vgic_mmio_write_config(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 2); - int i; - unsigned long flags; - - for (i = 0; i < len * 4; i++) { - struct vgic_irq *irq; - - /* - * The configuration cannot be changed for SGIs in general, - * for PPIs this is IMPLEMENTATION DEFINED. The arch timer - * code relies on PPIs being level triggered, so we also - * make them read-only here. 
- */ - if (intid + i < VGIC_NR_PRIVATE_IRQS) - continue; - - irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - raw_spin_lock_irqsave(&irq->irq_lock, flags); - - if (test_bit(i * 2 + 1, &val)) - irq->config = VGIC_CONFIG_EDGE; - else - irq->config = VGIC_CONFIG_LEVEL; - - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - } -} - -u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) -{ - int i; - u64 val = 0; - int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; - - for (i = 0; i < 32; i++) { - struct vgic_irq *irq; - - if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) - continue; - - irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level) - val |= (1U << i); - - vgic_put_irq(vcpu->kvm, irq); - } - - return val; -} - -void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, - const u64 val) -{ - int i; - int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; - unsigned long flags; - - for (i = 0; i < 32; i++) { - struct vgic_irq *irq; - bool new_level; - - if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) - continue; - - irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - /* - * Line level is set irrespective of irq type - * (level or edge) to avoid dependency that VM should - * restore irq config before line level. 
- */ - new_level = !!(val & (1U << i)); - raw_spin_lock_irqsave(&irq->irq_lock, flags); - irq->line_level = new_level; - if (new_level) - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - else - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - - vgic_put_irq(vcpu->kvm, irq); - } -} - -static int match_region(const void *key, const void *elt) -{ - const unsigned int offset = (unsigned long)key; - const struct vgic_register_region *region = elt; - - if (offset < region->reg_offset) - return -1; - - if (offset >= region->reg_offset + region->len) - return 1; - - return 0; -} - -const struct vgic_register_region * -vgic_find_mmio_region(const struct vgic_register_region *regions, - int nr_regions, unsigned int offset) -{ - return bsearch((void *)(uintptr_t)offset, regions, nr_regions, - sizeof(regions[0]), match_region); -} - -void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_set_vmcr(vcpu, vmcr); - else - vgic_v3_set_vmcr(vcpu, vmcr); -} - -void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_get_vmcr(vcpu, vmcr); - else - vgic_v3_get_vmcr(vcpu, vmcr); -} - -/* - * kvm_mmio_read_buf() returns a value in a format where it can be converted - * to a byte array and be directly observed as the guest wanted it to appear - * in memory if it had done the store itself, which is LE for the GIC, as the - * guest knows the GIC is always LE. - * - * We convert this value to the CPUs native format to deal with it as a data - * value. 
- */ -unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len) -{ - unsigned long data = kvm_mmio_read_buf(val, len); - - switch (len) { - case 1: - return data; - case 2: - return le16_to_cpu(data); - case 4: - return le32_to_cpu(data); - default: - return le64_to_cpu(data); - } -} - -/* - * kvm_mmio_write_buf() expects a value in a format such that if converted to - * a byte array it is observed as the guest would see it if it could perform - * the load directly. Since the GIC is LE, and the guest knows this, the - * guest expects a value in little endian format. - * - * We convert the data value from the CPUs native format to LE so that the - * value is returned in the proper format. - */ -void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, - unsigned long data) -{ - switch (len) { - case 1: - break; - case 2: - data = cpu_to_le16(data); - break; - case 4: - data = cpu_to_le32(data); - break; - default: - data = cpu_to_le64(data); - } - - kvm_mmio_write_buf(buf, len, data); -} - -static -struct vgic_io_device *kvm_to_vgic_iodev(const struct kvm_io_device *dev) -{ - return container_of(dev, struct vgic_io_device, dev); -} - -static bool check_region(const struct kvm *kvm, - const struct vgic_register_region *region, - gpa_t addr, int len) -{ - int flags, nr_irqs = kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; - - switch (len) { - case sizeof(u8): - flags = VGIC_ACCESS_8bit; - break; - case sizeof(u32): - flags = VGIC_ACCESS_32bit; - break; - case sizeof(u64): - flags = VGIC_ACCESS_64bit; - break; - default: - return false; - } - - if ((region->access_flags & flags) && IS_ALIGNED(addr, len)) { - if (!region->bits_per_irq) - return true; - - /* Do we access a non-allocated IRQ? 
*/ - return VGIC_ADDR_TO_INTID(addr, region->bits_per_irq) < nr_irqs; - } - - return false; -} - -const struct vgic_register_region * -vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, - gpa_t addr, int len) -{ - const struct vgic_register_region *region; - - region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, - addr - iodev->base_addr); - if (!region || !check_region(vcpu->kvm, region, addr, len)) - return NULL; - - return region; -} - -static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, - gpa_t addr, u32 *val) -{ - struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); - const struct vgic_register_region *region; - struct kvm_vcpu *r_vcpu; - - region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32)); - if (!region) { - *val = 0; - return 0; - } - - r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; - if (region->uaccess_read) - *val = region->uaccess_read(r_vcpu, addr, sizeof(u32)); - else - *val = region->read(r_vcpu, addr, sizeof(u32)); - - return 0; -} - -static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, - gpa_t addr, const u32 *val) -{ - struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); - const struct vgic_register_region *region; - struct kvm_vcpu *r_vcpu; - - region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32)); - if (!region) - return 0; - - r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; - if (region->uaccess_write) - return region->uaccess_write(r_vcpu, addr, sizeof(u32), *val); - - region->write(r_vcpu, addr, sizeof(u32), *val); - return 0; -} - -/* - * Userland access to VGIC registers. 
- */ -int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, - bool is_write, int offset, u32 *val) -{ - if (is_write) - return vgic_uaccess_write(vcpu, &dev->dev, offset, val); - else - return vgic_uaccess_read(vcpu, &dev->dev, offset, val); -} - -static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, - gpa_t addr, int len, void *val) -{ - struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); - const struct vgic_register_region *region; - unsigned long data = 0; - - region = vgic_get_mmio_region(vcpu, iodev, addr, len); - if (!region) { - memset(val, 0, len); - return 0; - } - - switch (iodev->iodev_type) { - case IODEV_CPUIF: - data = region->read(vcpu, addr, len); - break; - case IODEV_DIST: - data = region->read(vcpu, addr, len); - break; - case IODEV_REDIST: - data = region->read(iodev->redist_vcpu, addr, len); - break; - case IODEV_ITS: - data = region->its_read(vcpu->kvm, iodev->its, addr, len); - break; - } - - vgic_data_host_to_mmio_bus(val, len, data); - return 0; -} - -static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, - gpa_t addr, int len, const void *val) -{ - struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); - const struct vgic_register_region *region; - unsigned long data = vgic_data_mmio_bus_to_host(val, len); - - region = vgic_get_mmio_region(vcpu, iodev, addr, len); - if (!region) - return 0; - - switch (iodev->iodev_type) { - case IODEV_CPUIF: - region->write(vcpu, addr, len, data); - break; - case IODEV_DIST: - region->write(vcpu, addr, len, data); - break; - case IODEV_REDIST: - region->write(iodev->redist_vcpu, addr, len, data); - break; - case IODEV_ITS: - region->its_write(vcpu->kvm, iodev->its, addr, len, data); - break; - } - - return 0; -} - -struct kvm_io_device_ops kvm_io_gic_ops = { - .read = dispatch_mmio_read, - .write = dispatch_mmio_write, -}; - -int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, - enum vgic_type type) -{ - struct 
vgic_io_device *io_device = &kvm->arch.vgic.dist_iodev; - int ret = 0; - unsigned int len; - - switch (type) { - case VGIC_V2: - len = vgic_v2_init_dist_iodev(io_device); - break; - case VGIC_V3: - len = vgic_v3_init_dist_iodev(io_device); - break; - default: - BUG_ON(1); - } - - io_device->base_addr = dist_base_address; - io_device->iodev_type = IODEV_DIST; - io_device->redist_vcpu = NULL; - - mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address, - len, &io_device->dev); - mutex_unlock(&kvm->slots_lock); - - return ret; -} diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h deleted file mode 100644 index fefcca2b14dc..000000000000 --- a/virt/kvm/arm/vgic/vgic-mmio.h +++ /dev/null @@ -1,227 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2015, 2016 ARM Ltd. - */ -#ifndef __KVM_ARM_VGIC_MMIO_H__ -#define __KVM_ARM_VGIC_MMIO_H__ - -struct vgic_register_region { - unsigned int reg_offset; - unsigned int len; - unsigned int bits_per_irq; - unsigned int access_flags; - union { - unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len); - unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len); - }; - union { - void (*write)(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val); - void (*its_write)(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val); - }; - unsigned long (*uaccess_read)(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len); - union { - int (*uaccess_write)(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val); - int (*uaccess_its_write)(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val); - }; -}; - -extern struct kvm_io_device_ops kvm_io_gic_ops; - -#define VGIC_ACCESS_8bit 1 -#define VGIC_ACCESS_32bit 2 -#define VGIC_ACCESS_64bit 4 - -/* - * Generate a mask that covers the number 
of bytes required to address - * up to 1024 interrupts, each represented by <bits> bits. This assumes - * that <bits> is a power of two. - */ -#define VGIC_ADDR_IRQ_MASK(bits) (((bits) * 1024 / 8) - 1) - -/* - * (addr & mask) gives us the _byte_ offset for the INT ID. - * We multiply this by 8 the get the _bit_ offset, then divide this by - * the number of bits to learn the actual INT ID. - * But instead of a division (which requires a "long long div" implementation), - * we shift by the binary logarithm of <bits>. - * This assumes that <bits> is a power of two. - */ -#define VGIC_ADDR_TO_INTID(addr, bits) (((addr) & VGIC_ADDR_IRQ_MASK(bits)) * \ - 8 >> ilog2(bits)) - -/* - * Some VGIC registers store per-IRQ information, with a different number - * of bits per IRQ. For those registers this macro is used. - * The _WITH_LENGTH version instantiates registers with a fixed length - * and is mutually exclusive with the _PER_IRQ version. - */ -#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, ur, uw, bpi, acc) \ - { \ - .reg_offset = off, \ - .bits_per_irq = bpi, \ - .len = bpi * 1024 / 8, \ - .access_flags = acc, \ - .read = rd, \ - .write = wr, \ - .uaccess_read = ur, \ - .uaccess_write = uw, \ - } - -#define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc) \ - { \ - .reg_offset = off, \ - .bits_per_irq = 0, \ - .len = length, \ - .access_flags = acc, \ - .read = rd, \ - .write = wr, \ - } - -#define REGISTER_DESC_WITH_LENGTH_UACCESS(off, rd, wr, urd, uwr, length, acc) \ - { \ - .reg_offset = off, \ - .bits_per_irq = 0, \ - .len = length, \ - .access_flags = acc, \ - .read = rd, \ - .write = wr, \ - .uaccess_read = urd, \ - .uaccess_write = uwr, \ - } - -unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len); - -void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, - unsigned long data); - -unsigned long extract_bytes(u64 data, unsigned int offset, - unsigned int num); - -u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, - unsigned long val); 
- -unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val); - -int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val); - -unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len); - -void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val); - -unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -void 
vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_config(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, - bool is_write, int offset, u32 *val); - -u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid); - -void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, - const u64 val); - -unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); - -unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev); - -u64 vgic_sanitise_outer_cacheability(u64 reg); -u64 vgic_sanitise_inner_cacheability(u64 reg); -u64 vgic_sanitise_shareability(u64 reg); -u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift, - u64 (*sanitise_fn)(u64)); - -/* Find the proper register handler entry given a certain address offset */ -const struct vgic_register_region * -vgic_find_mmio_region(const struct vgic_register_region *regions, - int nr_regions, unsigned int offset); - -#endif diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c deleted file mode 100644 index 621cc168fe3f..000000000000 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ /dev/null @@ -1,504 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2015, 2016 ARM Ltd. 
- */ - -#include -#include -#include -#include -#include - -#include "vgic.h" - -static inline void vgic_v2_write_lr(int lr, u32 val) -{ - void __iomem *base = kvm_vgic_global_state.vctrl_base; - - writel_relaxed(val, base + GICH_LR0 + (lr * 4)); -} - -void vgic_v2_init_lrs(void) -{ - int i; - - for (i = 0; i < kvm_vgic_global_state.nr_lr; i++) - vgic_v2_write_lr(i, 0); -} - -void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) -{ - struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; - - cpuif->vgic_hcr |= GICH_HCR_UIE; -} - -static bool lr_signals_eoi_mi(u32 lr_val) -{ - return !(lr_val & GICH_LR_STATE) && (lr_val & GICH_LR_EOI) && - !(lr_val & GICH_LR_HW); -} - -/* - * transfer the content of the LRs back into the corresponding ap_list: - * - active bit is transferred as is - * - pending bit is - * - transferred as is in case of edge sensitive IRQs - * - set to the line-level (resample time) for level sensitive IRQs - */ -void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; - int lr; - - DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - - cpuif->vgic_hcr &= ~GICH_HCR_UIE; - - for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { - u32 val = cpuif->vgic_lr[lr]; - u32 cpuid, intid = val & GICH_LR_VIRTUALID; - struct vgic_irq *irq; - - /* Extract the source vCPU id from the LR */ - cpuid = val & GICH_LR_PHYSID_CPUID; - cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; - cpuid &= 7; - - /* Notify fds when the guest EOI'ed a level-triggered SPI */ - if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); - - irq = vgic_get_irq(vcpu->kvm, vcpu, intid); - - raw_spin_lock(&irq->irq_lock); - - /* Always preserve the active bit */ - irq->active = !!(val & GICH_LR_ACTIVE_BIT); - - if (irq->active && vgic_irq_is_sgi(intid)) - irq->active_source = cpuid; - - /* Edge is the only case where we preserve the pending bit 
*/ - if (irq->config == VGIC_CONFIG_EDGE && - (val & GICH_LR_PENDING_BIT)) { - irq->pending_latch = true; - - if (vgic_irq_is_sgi(intid)) - irq->source |= (1 << cpuid); - } - - /* - * Clear soft pending state when level irqs have been acked. - */ - if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE)) - irq->pending_latch = false; - - /* - * Level-triggered mapped IRQs are special because we only - * observe rising edges as input to the VGIC. - * - * If the guest never acked the interrupt we have to sample - * the physical line and set the line level, because the - * device state could have changed or we simply need to - * process the still pending interrupt later. - * - * If this causes us to lower the level, we have to also clear - * the physical active state, since we will otherwise never be - * told when the interrupt becomes asserted again. - */ - if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) { - irq->line_level = vgic_get_phys_line_level(irq); - - if (!irq->line_level) - vgic_irq_set_phys_active(irq, false); - } - - raw_spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); - } - - vgic_cpu->used_lrs = 0; -} - -/* - * Populates the particular LR with the state of a given IRQ: - * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq - * - for a level sensitive IRQ the pending state value is unchanged; - * it is dictated directly by the input level - * - * If @irq describes an SGI with multiple sources, we choose the - * lowest-numbered source VCPU and clear that bit in the source bitmap. - * - * The irq_lock must be held by the caller. 
- */ -void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) -{ - u32 val = irq->intid; - bool allow_pending = true; - - if (irq->active) { - val |= GICH_LR_ACTIVE_BIT; - if (vgic_irq_is_sgi(irq->intid)) - val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT; - if (vgic_irq_is_multi_sgi(irq)) { - allow_pending = false; - val |= GICH_LR_EOI; - } - } - - if (irq->group) - val |= GICH_LR_GROUP1; - - if (irq->hw) { - val |= GICH_LR_HW; - val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT; - /* - * Never set pending+active on a HW interrupt, as the - * pending state is kept at the physical distributor - * level. - */ - if (irq->active) - allow_pending = false; - } else { - if (irq->config == VGIC_CONFIG_LEVEL) { - val |= GICH_LR_EOI; - - /* - * Software resampling doesn't work very well - * if we allow P+A, so let's not do that. - */ - if (irq->active) - allow_pending = false; - } - } - - if (allow_pending && irq_is_pending(irq)) { - val |= GICH_LR_PENDING_BIT; - - if (irq->config == VGIC_CONFIG_EDGE) - irq->pending_latch = false; - - if (vgic_irq_is_sgi(irq->intid)) { - u32 src = ffs(irq->source); - - if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", - irq->intid)) - return; - - val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; - irq->source &= ~(1 << (src - 1)); - if (irq->source) { - irq->pending_latch = true; - val |= GICH_LR_EOI; - } - } - } - - /* - * Level-triggered mapped IRQs are special because we only observe - * rising edges as input to the VGIC. We therefore lower the line - * level here, so that we can take new virtual IRQs. See - * vgic_v2_fold_lr_state for more info. - */ - if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) - irq->line_level = false; - - /* The GICv2 LR only holds five bits of priority. 
*/ - val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; - - vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val; -} - -void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr) -{ - vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = 0; -} - -void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - u32 vmcr; - - vmcr = (vmcrp->grpen0 << GICH_VMCR_ENABLE_GRP0_SHIFT) & - GICH_VMCR_ENABLE_GRP0_MASK; - vmcr |= (vmcrp->grpen1 << GICH_VMCR_ENABLE_GRP1_SHIFT) & - GICH_VMCR_ENABLE_GRP1_MASK; - vmcr |= (vmcrp->ackctl << GICH_VMCR_ACK_CTL_SHIFT) & - GICH_VMCR_ACK_CTL_MASK; - vmcr |= (vmcrp->fiqen << GICH_VMCR_FIQ_EN_SHIFT) & - GICH_VMCR_FIQ_EN_MASK; - vmcr |= (vmcrp->cbpr << GICH_VMCR_CBPR_SHIFT) & - GICH_VMCR_CBPR_MASK; - vmcr |= (vmcrp->eoim << GICH_VMCR_EOI_MODE_SHIFT) & - GICH_VMCR_EOI_MODE_MASK; - vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) & - GICH_VMCR_ALIAS_BINPOINT_MASK; - vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & - GICH_VMCR_BINPOINT_MASK; - vmcr |= ((vmcrp->pmr >> GICV_PMR_PRIORITY_SHIFT) << - GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK; - - cpu_if->vgic_vmcr = vmcr; -} - -void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - u32 vmcr; - - vmcr = cpu_if->vgic_vmcr; - - vmcrp->grpen0 = (vmcr & GICH_VMCR_ENABLE_GRP0_MASK) >> - GICH_VMCR_ENABLE_GRP0_SHIFT; - vmcrp->grpen1 = (vmcr & GICH_VMCR_ENABLE_GRP1_MASK) >> - GICH_VMCR_ENABLE_GRP1_SHIFT; - vmcrp->ackctl = (vmcr & GICH_VMCR_ACK_CTL_MASK) >> - GICH_VMCR_ACK_CTL_SHIFT; - vmcrp->fiqen = (vmcr & GICH_VMCR_FIQ_EN_MASK) >> - GICH_VMCR_FIQ_EN_SHIFT; - vmcrp->cbpr = (vmcr & GICH_VMCR_CBPR_MASK) >> - GICH_VMCR_CBPR_SHIFT; - vmcrp->eoim = (vmcr & GICH_VMCR_EOI_MODE_MASK) >> - GICH_VMCR_EOI_MODE_SHIFT; - - vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >> - GICH_VMCR_ALIAS_BINPOINT_SHIFT; - vmcrp->bpr = (vmcr & GICH_VMCR_BINPOINT_MASK) >> - 
GICH_VMCR_BINPOINT_SHIFT; - vmcrp->pmr = ((vmcr & GICH_VMCR_PRIMASK_MASK) >> - GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT; -} - -void vgic_v2_enable(struct kvm_vcpu *vcpu) -{ - /* - * By forcing VMCR to zero, the GIC will restore the binary - * points to their reset values. Anything else resets to zero - * anyway. - */ - vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; - - /* Get the show on the road... */ - vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; -} - -/* check for overlapping regions and for regions crossing the end of memory */ -static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base) -{ - if (dist_base + KVM_VGIC_V2_DIST_SIZE < dist_base) - return false; - if (cpu_base + KVM_VGIC_V2_CPU_SIZE < cpu_base) - return false; - - if (dist_base + KVM_VGIC_V2_DIST_SIZE <= cpu_base) - return true; - if (cpu_base + KVM_VGIC_V2_CPU_SIZE <= dist_base) - return true; - - return false; -} - -int vgic_v2_map_resources(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - int ret = 0; - - if (vgic_ready(kvm)) - goto out; - - if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || - IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) { - kvm_err("Need to set vgic cpu and dist addresses first\n"); - ret = -ENXIO; - goto out; - } - - if (!vgic_v2_check_base(dist->vgic_dist_base, dist->vgic_cpu_base)) { - kvm_err("VGIC CPU and dist frames overlap\n"); - ret = -EINVAL; - goto out; - } - - /* - * Initialize the vgic if this hasn't already been done on demand by - * accessing the vgic state from userspace. 
- */ - ret = vgic_init(kvm); - if (ret) { - kvm_err("Unable to initialize VGIC dynamic data structures\n"); - goto out; - } - - ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V2); - if (ret) { - kvm_err("Unable to register VGIC MMIO regions\n"); - goto out; - } - - if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) { - ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base, - kvm_vgic_global_state.vcpu_base, - KVM_VGIC_V2_CPU_SIZE, true); - if (ret) { - kvm_err("Unable to remap VGIC CPU to VCPU\n"); - goto out; - } - } - - dist->ready = true; - -out: - return ret; -} - -DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap); - -/** - * vgic_v2_probe - probe for a VGICv2 compatible interrupt controller - * @info: pointer to the GIC description - * - * Returns 0 if the VGICv2 has been probed successfully, returns an error code - * otherwise - */ -int vgic_v2_probe(const struct gic_kvm_info *info) -{ - int ret; - u32 vtr; - - if (!info->vctrl.start) { - kvm_err("GICH not present in the firmware table\n"); - return -ENXIO; - } - - if (!PAGE_ALIGNED(info->vcpu.start) || - !PAGE_ALIGNED(resource_size(&info->vcpu))) { - kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n"); - - ret = create_hyp_io_mappings(info->vcpu.start, - resource_size(&info->vcpu), - &kvm_vgic_global_state.vcpu_base_va, - &kvm_vgic_global_state.vcpu_hyp_va); - if (ret) { - kvm_err("Cannot map GICV into hyp\n"); - goto out; - } - - static_branch_enable(&vgic_v2_cpuif_trap); - } - - ret = create_hyp_io_mappings(info->vctrl.start, - resource_size(&info->vctrl), - &kvm_vgic_global_state.vctrl_base, - &kvm_vgic_global_state.vctrl_hyp); - if (ret) { - kvm_err("Cannot map VCTRL into hyp\n"); - goto out; - } - - vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR); - kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1; - - ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); - if (ret) { - kvm_err("Cannot register GICv2 KVM device\n"); - goto out; - } - - 
kvm_vgic_global_state.can_emulate_gicv2 = true; - kvm_vgic_global_state.vcpu_base = info->vcpu.start; - kvm_vgic_global_state.type = VGIC_V2; - kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS; - - kvm_debug("vgic-v2@%llx\n", info->vctrl.start); - - return 0; -out: - if (kvm_vgic_global_state.vctrl_base) - iounmap(kvm_vgic_global_state.vctrl_base); - if (kvm_vgic_global_state.vcpu_base_va) - iounmap(kvm_vgic_global_state.vcpu_base_va); - - return ret; -} - -static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; - u64 elrsr; - int i; - - elrsr = readl_relaxed(base + GICH_ELRSR0); - if (unlikely(used_lrs > 32)) - elrsr |= ((u64)readl_relaxed(base + GICH_ELRSR1)) << 32; - - for (i = 0; i < used_lrs; i++) { - if (elrsr & (1UL << i)) - cpu_if->vgic_lr[i] &= ~GICH_LR_STATE; - else - cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4)); - - writel_relaxed(0, base + GICH_LR0 + (i * 4)); - } -} - -void vgic_v2_save_state(struct kvm_vcpu *vcpu) -{ - void __iomem *base = kvm_vgic_global_state.vctrl_base; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; - - if (!base) - return; - - if (used_lrs) { - save_lrs(vcpu, base); - writel_relaxed(0, base + GICH_HCR); - } -} - -void vgic_v2_restore_state(struct kvm_vcpu *vcpu) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - void __iomem *base = kvm_vgic_global_state.vctrl_base; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; - int i; - - if (!base) - return; - - if (used_lrs) { - writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR); - for (i = 0; i < used_lrs; i++) { - writel_relaxed(cpu_if->vgic_lr[i], - base + GICH_LR0 + (i * 4)); - } - } -} - -void vgic_v2_load(struct kvm_vcpu *vcpu) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - - writel_relaxed(cpu_if->vgic_vmcr, - kvm_vgic_global_state.vctrl_base + GICH_VMCR); - writel_relaxed(cpu_if->vgic_apr, - 
kvm_vgic_global_state.vctrl_base + GICH_APR); -} - -void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - - cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); -} - -void vgic_v2_put(struct kvm_vcpu *vcpu) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - - vgic_v2_vmcr_sync(vcpu); - cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR); -} diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c deleted file mode 100644 index 2c9fc13e2c59..000000000000 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ /dev/null @@ -1,693 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only - -#include -#include -#include -#include -#include -#include -#include - -#include "vgic.h" - -static bool group0_trap; -static bool group1_trap; -static bool common_trap; -static bool gicv4_enable; - -void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; - - cpuif->vgic_hcr |= ICH_HCR_UIE; -} - -static bool lr_signals_eoi_mi(u64 lr_val) -{ - return !(lr_val & ICH_LR_STATE) && (lr_val & ICH_LR_EOI) && - !(lr_val & ICH_LR_HW); -} - -void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; - u32 model = vcpu->kvm->arch.vgic.vgic_model; - int lr; - - DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - - cpuif->vgic_hcr &= ~ICH_HCR_UIE; - - for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { - u64 val = cpuif->vgic_lr[lr]; - u32 intid, cpuid; - struct vgic_irq *irq; - bool is_v2_sgi = false; - - cpuid = val & GICH_LR_PHYSID_CPUID; - cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; - - if (model == KVM_DEV_TYPE_ARM_VGIC_V3) { - intid = val & ICH_LR_VIRTUAL_ID_MASK; - } else { - intid = val & GICH_LR_VIRTUALID; - is_v2_sgi = vgic_irq_is_sgi(intid); - } - - /* Notify fds when the guest EOI'ed a level-triggered IRQ */ - if 
(lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); - - irq = vgic_get_irq(vcpu->kvm, vcpu, intid); - if (!irq) /* An LPI could have been unmapped. */ - continue; - - raw_spin_lock(&irq->irq_lock); - - /* Always preserve the active bit */ - irq->active = !!(val & ICH_LR_ACTIVE_BIT); - - if (irq->active && is_v2_sgi) - irq->active_source = cpuid; - - /* Edge is the only case where we preserve the pending bit */ - if (irq->config == VGIC_CONFIG_EDGE && - (val & ICH_LR_PENDING_BIT)) { - irq->pending_latch = true; - - if (is_v2_sgi) - irq->source |= (1 << cpuid); - } - - /* - * Clear soft pending state when level irqs have been acked. - */ - if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) - irq->pending_latch = false; - - /* - * Level-triggered mapped IRQs are special because we only - * observe rising edges as input to the VGIC. - * - * If the guest never acked the interrupt we have to sample - * the physical line and set the line level, because the - * device state could have changed or we simply need to - * process the still pending interrupt later. - * - * If this causes us to lower the level, we have to also clear - * the physical active state, since we will otherwise never be - * told when the interrupt becomes asserted again. 
- */ - if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) { - irq->line_level = vgic_get_phys_line_level(irq); - - if (!irq->line_level) - vgic_irq_set_phys_active(irq, false); - } - - raw_spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); - } - - vgic_cpu->used_lrs = 0; -} - -/* Requires the irq to be locked already */ -void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) -{ - u32 model = vcpu->kvm->arch.vgic.vgic_model; - u64 val = irq->intid; - bool allow_pending = true, is_v2_sgi; - - is_v2_sgi = (vgic_irq_is_sgi(irq->intid) && - model == KVM_DEV_TYPE_ARM_VGIC_V2); - - if (irq->active) { - val |= ICH_LR_ACTIVE_BIT; - if (is_v2_sgi) - val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT; - if (vgic_irq_is_multi_sgi(irq)) { - allow_pending = false; - val |= ICH_LR_EOI; - } - } - - if (irq->hw) { - val |= ICH_LR_HW; - val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT; - /* - * Never set pending+active on a HW interrupt, as the - * pending state is kept at the physical distributor - * level. - */ - if (irq->active) - allow_pending = false; - } else { - if (irq->config == VGIC_CONFIG_LEVEL) { - val |= ICH_LR_EOI; - - /* - * Software resampling doesn't work very well - * if we allow P+A, so let's not do that. - */ - if (irq->active) - allow_pending = false; - } - } - - if (allow_pending && irq_is_pending(irq)) { - val |= ICH_LR_PENDING_BIT; - - if (irq->config == VGIC_CONFIG_EDGE) - irq->pending_latch = false; - - if (vgic_irq_is_sgi(irq->intid) && - model == KVM_DEV_TYPE_ARM_VGIC_V2) { - u32 src = ffs(irq->source); - - if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", - irq->intid)) - return; - - val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; - irq->source &= ~(1 << (src - 1)); - if (irq->source) { - irq->pending_latch = true; - val |= ICH_LR_EOI; - } - } - } - - /* - * Level-triggered mapped IRQs are special because we only observe - * rising edges as input to the VGIC. 
We therefore lower the line - * level here, so that we can take new virtual IRQs. See - * vgic_v3_fold_lr_state for more info. - */ - if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) - irq->line_level = false; - - if (irq->group) - val |= ICH_LR_GROUP; - - val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; - - vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; -} - -void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) -{ - vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0; -} - -void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u32 model = vcpu->kvm->arch.vgic.vgic_model; - u32 vmcr; - - if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { - vmcr = (vmcrp->ackctl << ICH_VMCR_ACK_CTL_SHIFT) & - ICH_VMCR_ACK_CTL_MASK; - vmcr |= (vmcrp->fiqen << ICH_VMCR_FIQ_EN_SHIFT) & - ICH_VMCR_FIQ_EN_MASK; - } else { - /* - * When emulating GICv3 on GICv3 with SRE=1 on the - * VFIQEn bit is RES1 and the VAckCtl bit is RES0. 
- */ - vmcr = ICH_VMCR_FIQ_EN_MASK; - } - - vmcr |= (vmcrp->cbpr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK; - vmcr |= (vmcrp->eoim << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK; - vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK; - vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK; - vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK; - vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK; - vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK; - - cpu_if->vgic_vmcr = vmcr; -} - -void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u32 model = vcpu->kvm->arch.vgic.vgic_model; - u32 vmcr; - - vmcr = cpu_if->vgic_vmcr; - - if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { - vmcrp->ackctl = (vmcr & ICH_VMCR_ACK_CTL_MASK) >> - ICH_VMCR_ACK_CTL_SHIFT; - vmcrp->fiqen = (vmcr & ICH_VMCR_FIQ_EN_MASK) >> - ICH_VMCR_FIQ_EN_SHIFT; - } else { - /* - * When emulating GICv3 on GICv3 with SRE=1 on the - * VFIQEn bit is RES1 and the VAckCtl bit is RES0. 
- */ - vmcrp->fiqen = 1; - vmcrp->ackctl = 0; - } - - vmcrp->cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; - vmcrp->eoim = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT; - vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; - vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; - vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; - vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT; - vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT; -} - -#define INITIAL_PENDBASER_VALUE \ - (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) | \ - GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \ - GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)) - -void vgic_v3_enable(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; - - /* - * By forcing VMCR to zero, the GIC will restore the binary - * points to their reset values. Anything else resets to zero - * anyway. - */ - vgic_v3->vgic_vmcr = 0; - - /* - * If we are emulating a GICv3, we do it in an non-GICv2-compatible - * way, so we force SRE to 1 to demonstrate this to the guest. - * Also, we don't support any form of IRQ/FIQ bypass. - * This goes with the spec allowing the value to be RAO/WI. - */ - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - vgic_v3->vgic_sre = (ICC_SRE_EL1_DIB | - ICC_SRE_EL1_DFB | - ICC_SRE_EL1_SRE); - vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE; - } else { - vgic_v3->vgic_sre = 0; - } - - vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_ID_BITS_MASK) >> - ICH_VTR_ID_BITS_SHIFT; - vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_PRI_BITS_MASK) >> - ICH_VTR_PRI_BITS_SHIFT) + 1; - - /* Get the show on the road... 
*/ - vgic_v3->vgic_hcr = ICH_HCR_EN; - if (group0_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TALL0; - if (group1_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TALL1; - if (common_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TC; -} - -int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) -{ - struct kvm_vcpu *vcpu; - int byte_offset, bit_nr; - gpa_t pendbase, ptr; - bool status; - u8 val; - int ret; - unsigned long flags; - -retry: - vcpu = irq->target_vcpu; - if (!vcpu) - return 0; - - pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); - - byte_offset = irq->intid / BITS_PER_BYTE; - bit_nr = irq->intid % BITS_PER_BYTE; - ptr = pendbase + byte_offset; - - ret = kvm_read_guest_lock(kvm, ptr, &val, 1); - if (ret) - return ret; - - status = val & (1 << bit_nr); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - if (irq->target_vcpu != vcpu) { - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - goto retry; - } - irq->pending_latch = status; - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - - if (status) { - /* clear consumed data */ - val &= ~(1 << bit_nr); - ret = kvm_write_guest_lock(kvm, ptr, &val, 1); - if (ret) - return ret; - } - return 0; -} - -/** - * vgic_v3_save_pending_tables - Save the pending tables into guest RAM - * kvm lock and all vcpu lock must be held - */ -int vgic_v3_save_pending_tables(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_irq *irq; - gpa_t last_ptr = ~(gpa_t)0; - int ret; - u8 val; - - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { - int byte_offset, bit_nr; - struct kvm_vcpu *vcpu; - gpa_t pendbase, ptr; - bool stored; - - vcpu = irq->target_vcpu; - if (!vcpu) - continue; - - pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); - - byte_offset = irq->intid / BITS_PER_BYTE; - bit_nr = irq->intid % BITS_PER_BYTE; - ptr = pendbase + byte_offset; - - if (ptr != last_ptr) { - ret = kvm_read_guest_lock(kvm, ptr, &val, 1); - if (ret) - return ret; - last_ptr = ptr; - } - - 
stored = val & (1U << bit_nr); - if (stored == irq->pending_latch) - continue; - - if (irq->pending_latch) - val |= 1 << bit_nr; - else - val &= ~(1 << bit_nr); - - ret = kvm_write_guest_lock(kvm, ptr, &val, 1); - if (ret) - return ret; - } - return 0; -} - -/** - * vgic_v3_rdist_overlap - check if a region overlaps with any - * existing redistributor region - * - * @kvm: kvm handle - * @base: base of the region - * @size: size of region - * - * Return: true if there is an overlap - */ -bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size) -{ - struct vgic_dist *d = &kvm->arch.vgic; - struct vgic_redist_region *rdreg; - - list_for_each_entry(rdreg, &d->rd_regions, list) { - if ((base + size > rdreg->base) && - (base < rdreg->base + vgic_v3_rd_region_size(kvm, rdreg))) - return true; - } - return false; -} - -/* - * Check for overlapping regions and for regions crossing the end of memory - * for base addresses which have already been set. - */ -bool vgic_v3_check_base(struct kvm *kvm) -{ - struct vgic_dist *d = &kvm->arch.vgic; - struct vgic_redist_region *rdreg; - - if (!IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) && - d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base) - return false; - - list_for_each_entry(rdreg, &d->rd_regions, list) { - if (rdreg->base + vgic_v3_rd_region_size(kvm, rdreg) < - rdreg->base) - return false; - } - - if (IS_VGIC_ADDR_UNDEF(d->vgic_dist_base)) - return true; - - return !vgic_v3_rdist_overlap(kvm, d->vgic_dist_base, - KVM_VGIC_V3_DIST_SIZE); -} - -/** - * vgic_v3_rdist_free_slot - Look up registered rdist regions and identify one - * which has free space to put a new rdist region. - * - * @rd_regions: redistributor region list head - * - * A redistributor regions maps n redistributors, n = region size / (2 x 64kB). - * Stride between redistributors is 0 and regions are filled in the index order. - * - * Return: the redist region handle, if any, that has space to map a new rdist - * region. 
- */ -struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rd_regions) -{ - struct vgic_redist_region *rdreg; - - list_for_each_entry(rdreg, rd_regions, list) { - if (!vgic_v3_redist_region_full(rdreg)) - return rdreg; - } - return NULL; -} - -struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, - u32 index) -{ - struct list_head *rd_regions = &kvm->arch.vgic.rd_regions; - struct vgic_redist_region *rdreg; - - list_for_each_entry(rdreg, rd_regions, list) { - if (rdreg->index == index) - return rdreg; - } - return NULL; -} - - -int vgic_v3_map_resources(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int ret = 0; - int c; - - if (vgic_ready(kvm)) - goto out; - - kvm_for_each_vcpu(c, vcpu, kvm) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - if (IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) { - kvm_debug("vcpu %d redistributor base not set\n", c); - ret = -ENXIO; - goto out; - } - } - - if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base)) { - kvm_err("Need to set vgic distributor addresses first\n"); - ret = -ENXIO; - goto out; - } - - if (!vgic_v3_check_base(kvm)) { - kvm_err("VGIC redist and dist frames overlap\n"); - ret = -EINVAL; - goto out; - } - - /* - * For a VGICv3 we require the userland to explicitly initialize - * the VGIC before we need to use it. 
- */ - if (!vgic_initialized(kvm)) { - ret = -EBUSY; - goto out; - } - - ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V3); - if (ret) { - kvm_err("Unable to register VGICv3 dist MMIO regions\n"); - goto out; - } - - if (kvm_vgic_global_state.has_gicv4_1) - vgic_v4_configure_vsgis(kvm); - dist->ready = true; - -out: - return ret; -} - -DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap); - -static int __init early_group0_trap_cfg(char *buf) -{ - return strtobool(buf, &group0_trap); -} -early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg); - -static int __init early_group1_trap_cfg(char *buf) -{ - return strtobool(buf, &group1_trap); -} -early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg); - -static int __init early_common_trap_cfg(char *buf) -{ - return strtobool(buf, &common_trap); -} -early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg); - -static int __init early_gicv4_enable(char *buf) -{ - return strtobool(buf, &gicv4_enable); -} -early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); - -/** - * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller - * @info: pointer to the GIC description - * - * Returns 0 if the VGICv3 has been probed successfully, returns an error code - * otherwise - */ -int vgic_v3_probe(const struct gic_kvm_info *info) -{ - u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2); - int ret; - - /* - * The ListRegs field is 5 bits, but there is a architectural - * maximum of 16 list registers. Just ignore bit 4... - */ - kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1; - kvm_vgic_global_state.can_emulate_gicv2 = false; - kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2; - - /* GICv4 support? */ - if (info->has_v4) { - kvm_vgic_global_state.has_gicv4 = gicv4_enable; - kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable; - kvm_info("GICv4%s support %sabled\n", - kvm_vgic_global_state.has_gicv4_1 ? ".1" : "", - gicv4_enable ? 
"en" : "dis"); - } - - if (!info->vcpu.start) { - kvm_info("GICv3: no GICV resource entry\n"); - kvm_vgic_global_state.vcpu_base = 0; - } else if (!PAGE_ALIGNED(info->vcpu.start)) { - pr_warn("GICV physical address 0x%llx not page aligned\n", - (unsigned long long)info->vcpu.start); - kvm_vgic_global_state.vcpu_base = 0; - } else { - kvm_vgic_global_state.vcpu_base = info->vcpu.start; - kvm_vgic_global_state.can_emulate_gicv2 = true; - ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); - if (ret) { - kvm_err("Cannot register GICv2 KVM device.\n"); - return ret; - } - kvm_info("vgic-v2@%llx\n", info->vcpu.start); - } - ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3); - if (ret) { - kvm_err("Cannot register GICv3 KVM device.\n"); - kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2); - return ret; - } - - if (kvm_vgic_global_state.vcpu_base == 0) - kvm_info("disabling GICv2 emulation\n"); - -#ifdef CONFIG_ARM64 - if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_30115)) { - group0_trap = true; - group1_trap = true; - } -#endif - - if (group0_trap || group1_trap || common_trap) { - kvm_info("GICv3 sysreg trapping enabled ([%s%s%s], reduced performance)\n", - group0_trap ? "G0" : "", - group1_trap ? "G1" : "", - common_trap ? "C" : ""); - static_branch_enable(&vgic_v3_cpuif_trap); - } - - kvm_vgic_global_state.vctrl_base = NULL; - kvm_vgic_global_state.type = VGIC_V3; - kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS; - - return 0; -} - -void vgic_v3_load(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - - /* - * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen - * is dependent on ICC_SRE_EL1.SRE, and we have to perform the - * VMCR_EL2 save/restore in the world switch. 
- */ - if (likely(cpu_if->vgic_sre)) - kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); - - kvm_call_hyp(__vgic_v3_restore_aprs, vcpu); - - if (has_vhe()) - __vgic_v3_activate_traps(vcpu); - - WARN_ON(vgic_v4_load(vcpu)); -} - -void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - - if (likely(cpu_if->vgic_sre)) - cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr); -} - -void vgic_v3_put(struct kvm_vcpu *vcpu) -{ - WARN_ON(vgic_v4_put(vcpu, false)); - - vgic_v3_vmcr_sync(vcpu); - - kvm_call_hyp(__vgic_v3_save_aprs, vcpu); - - if (has_vhe()) - __vgic_v3_deactivate_traps(vcpu); -} diff --git a/virt/kvm/arm/vgic/vgic-v4.c b/virt/kvm/arm/vgic/vgic-v4.c deleted file mode 100644 index 27ac833e5ec7..000000000000 --- a/virt/kvm/arm/vgic/vgic-v4.c +++ /dev/null @@ -1,453 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2017 ARM Ltd. - * Author: Marc Zyngier - */ - -#include -#include -#include -#include -#include - -#include "vgic.h" - -/* - * How KVM uses GICv4 (insert rude comments here): - * - * The vgic-v4 layer acts as a bridge between several entities: - * - The GICv4 ITS representation offered by the ITS driver - * - VFIO, which is in charge of the PCI endpoint - * - The virtual ITS, which is the only thing the guest sees - * - * The configuration of VLPIs is triggered by a callback from VFIO, - * instructing KVM that a PCI device has been configured to deliver - * MSIs to a vITS. - * - * kvm_vgic_v4_set_forwarding() is thus called with the routing entry, - * and this is used to find the corresponding vITS data structures - * (ITS instance, device, event and irq) using a process that is - * extremely similar to the injection of an MSI. - * - * At this stage, we can link the guest's view of an LPI (uniquely - * identified by the routing entry) and the host irq, using the GICv4 - * driver mapping operation. 
Should the mapping succeed, we've then - * successfully upgraded the guest's LPI to a VLPI. We can then start - * with updating GICv4's view of the property table and generating an - * INValidation in order to kickstart the delivery of this VLPI to the - * guest directly, without software intervention. Well, almost. - * - * When the PCI endpoint is deconfigured, this operation is reversed - * with VFIO calling kvm_vgic_v4_unset_forwarding(). - * - * Once the VLPI has been mapped, it needs to follow any change the - * guest performs on its LPI through the vITS. For that, a number of - * command handlers have hooks to communicate these changes to the HW: - * - Any invalidation triggers a call to its_prop_update_vlpi() - * - The INT command results in a irq_set_irqchip_state(), which - * generates an INT on the corresponding VLPI. - * - The CLEAR command results in a irq_set_irqchip_state(), which - * generates an CLEAR on the corresponding VLPI. - * - DISCARD translates into an unmap, similar to a call to - * kvm_vgic_v4_unset_forwarding(). - * - MOVI is translated by an update of the existing mapping, changing - * the target vcpu, resulting in a VMOVI being generated. - * - MOVALL is translated by a string of mapping updates (similar to - * the handling of MOVI). MOVALL is horrible. - * - * Note that a DISCARD/MAPTI sequence emitted from the guest without - * reprogramming the PCI endpoint after MAPTI does not result in a - * VLPI being mapped, as there is no callback from VFIO (the guest - * will get the interrupt via the normal SW injection). Fixing this is - * not trivial, and requires some horrible messing with the VFIO - * internals. Not fun. Don't do that. - * - * Then there is the scheduling. Each time a vcpu is about to run on a - * physical CPU, KVM must tell the corresponding redistributor about - * it. And if we've migrated our vcpu from one CPU to another, we must - * tell the ITS (so that the messages reach the right redistributor). 
- * This is done in two steps: first issue a irq_set_affinity() on the - * irq corresponding to the vcpu, then call its_make_vpe_resident(). - * You must be in a non-preemptible context. On exit, a call to - * its_make_vpe_non_resident() tells the redistributor that we're done - * with the vcpu. - * - * Finally, the doorbell handling: Each vcpu is allocated an interrupt - * which will fire each time a VLPI is made pending whilst the vcpu is - * not running. Each time the vcpu gets blocked, the doorbell - * interrupt gets enabled. When the vcpu is unblocked (for whatever - * reason), the doorbell interrupt is disabled. - */ - -#define DB_IRQ_FLAGS (IRQ_NOAUTOEN | IRQ_DISABLE_UNLAZY | IRQ_NO_BALANCING) - -static irqreturn_t vgic_v4_doorbell_handler(int irq, void *info) -{ - struct kvm_vcpu *vcpu = info; - - /* We got the message, no need to fire again */ - if (!kvm_vgic_global_state.has_gicv4_1 && - !irqd_irq_disabled(&irq_to_desc(irq)->irq_data)) - disable_irq_nosync(irq); - - vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last = true; - kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); - kvm_vcpu_kick(vcpu); - - return IRQ_HANDLED; -} - -static void vgic_v4_sync_sgi_config(struct its_vpe *vpe, struct vgic_irq *irq) -{ - vpe->sgi_config[irq->intid].enabled = irq->enabled; - vpe->sgi_config[irq->intid].group = irq->group; - vpe->sgi_config[irq->intid].priority = irq->priority; -} - -static void vgic_v4_enable_vsgis(struct kvm_vcpu *vcpu) -{ - struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; - int i; - - /* - * With GICv4.1, every virtual SGI can be directly injected. So - * let's pretend that they are HW interrupts, tied to a host - * IRQ. The SGI code will do its magic. 
- */ - for (i = 0; i < VGIC_NR_SGIS; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i); - struct irq_desc *desc; - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - - if (irq->hw) - goto unlock; - - irq->hw = true; - irq->host_irq = irq_find_mapping(vpe->sgi_domain, i); - - /* Transfer the full irq state to the vPE */ - vgic_v4_sync_sgi_config(vpe, irq); - desc = irq_to_desc(irq->host_irq); - ret = irq_domain_activate_irq(irq_desc_get_irq_data(desc), - false); - if (!WARN_ON(ret)) { - /* Transfer pending state */ - ret = irq_set_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - irq->pending_latch); - WARN_ON(ret); - irq->pending_latch = false; - } - unlock: - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - } -} - -static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu) -{ - int i; - - for (i = 0; i < VGIC_NR_SGIS; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i); - struct irq_desc *desc; - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - - if (!irq->hw) - goto unlock; - - irq->hw = false; - ret = irq_get_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - &irq->pending_latch); - WARN_ON(ret); - - desc = irq_to_desc(irq->host_irq); - irq_domain_deactivate_irq(irq_desc_get_irq_data(desc)); - unlock: - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - } -} - -/* Must be called with the kvm lock held */ -void vgic_v4_configure_vsgis(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int i; - - kvm_arm_halt_guest(kvm); - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (dist->nassgireq) - vgic_v4_enable_vsgis(vcpu); - else - vgic_v4_disable_vsgis(vcpu); - } - - kvm_arm_resume_guest(kvm); -} - -/** - * vgic_v4_init - Initialize the GICv4 data structures - * @kvm: Pointer to the VM being initialized - * - * We may be called each time a vITS is created, or when 
the - * vgic is initialized. This relies on kvm->lock to be - * held. In both cases, the number of vcpus should now be - * fixed. - */ -int vgic_v4_init(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int i, nr_vcpus, ret; - - if (!kvm_vgic_global_state.has_gicv4) - return 0; /* Nothing to see here... move along. */ - - if (dist->its_vm.vpes) - return 0; - - nr_vcpus = atomic_read(&kvm->online_vcpus); - - dist->its_vm.vpes = kcalloc(nr_vcpus, sizeof(*dist->its_vm.vpes), - GFP_KERNEL); - if (!dist->its_vm.vpes) - return -ENOMEM; - - dist->its_vm.nr_vpes = nr_vcpus; - - kvm_for_each_vcpu(i, vcpu, kvm) - dist->its_vm.vpes[i] = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; - - ret = its_alloc_vcpu_irqs(&dist->its_vm); - if (ret < 0) { - kvm_err("VPE IRQ allocation failure\n"); - kfree(dist->its_vm.vpes); - dist->its_vm.nr_vpes = 0; - dist->its_vm.vpes = NULL; - return ret; - } - - kvm_for_each_vcpu(i, vcpu, kvm) { - int irq = dist->its_vm.vpes[i]->irq; - unsigned long irq_flags = DB_IRQ_FLAGS; - - /* - * Don't automatically enable the doorbell, as we're - * flipping it back and forth when the vcpu gets - * blocked. Also disable the lazy disabling, as the - * doorbell could kick us out of the guest too - * early... - * - * On GICv4.1, the doorbell is managed in HW and must - * be left enabled. - */ - if (kvm_vgic_global_state.has_gicv4_1) - irq_flags &= ~IRQ_NOAUTOEN; - irq_set_status_flags(irq, irq_flags); - - ret = request_irq(irq, vgic_v4_doorbell_handler, - 0, "vcpu", vcpu); - if (ret) { - kvm_err("failed to allocate vcpu IRQ%d\n", irq); - /* - * Trick: adjust the number of vpes so we know - * how many to nuke on teardown... - */ - dist->its_vm.nr_vpes = i; - break; - } - } - - if (ret) - vgic_v4_teardown(kvm); - - return ret; -} - -/** - * vgic_v4_teardown - Free the GICv4 data structures - * @kvm: Pointer to the VM being destroyed - * - * Relies on kvm->lock to be held. 
- */ -void vgic_v4_teardown(struct kvm *kvm) -{ - struct its_vm *its_vm = &kvm->arch.vgic.its_vm; - int i; - - if (!its_vm->vpes) - return; - - for (i = 0; i < its_vm->nr_vpes; i++) { - struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, i); - int irq = its_vm->vpes[i]->irq; - - irq_clear_status_flags(irq, DB_IRQ_FLAGS); - free_irq(irq, vcpu); - } - - its_free_vcpu_irqs(its_vm); - kfree(its_vm->vpes); - its_vm->nr_vpes = 0; - its_vm->vpes = NULL; -} - -int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db) -{ - struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; - - if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident) - return 0; - - return its_make_vpe_non_resident(vpe, need_db); -} - -int vgic_v4_load(struct kvm_vcpu *vcpu) -{ - struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; - int err; - - if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident) - return 0; - - /* - * Before making the VPE resident, make sure the redistributor - * corresponding to our current CPU expects us here. See the - * doc in drivers/irqchip/irq-gic-v4.c to understand how this - * turns into a VMOVP command at the ITS level. - */ - err = irq_set_affinity(vpe->irq, cpumask_of(smp_processor_id())); - if (err) - return err; - - err = its_make_vpe_resident(vpe, false, vcpu->kvm->arch.vgic.enabled); - if (err) - return err; - - /* - * Now that the VPE is resident, let's get rid of a potential - * doorbell interrupt that would still be pending. This is a - * GICv4.0 only "feature"... 
- */ - if (!kvm_vgic_global_state.has_gicv4_1) - err = irq_set_irqchip_state(vpe->irq, IRQCHIP_STATE_PENDING, false); - - return err; -} - -static struct vgic_its *vgic_get_its(struct kvm *kvm, - struct kvm_kernel_irq_routing_entry *irq_entry) -{ - struct kvm_msi msi = (struct kvm_msi) { - .address_lo = irq_entry->msi.address_lo, - .address_hi = irq_entry->msi.address_hi, - .data = irq_entry->msi.data, - .flags = irq_entry->msi.flags, - .devid = irq_entry->msi.devid, - }; - - return vgic_msi_to_its(kvm, &msi); -} - -int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, - struct kvm_kernel_irq_routing_entry *irq_entry) -{ - struct vgic_its *its; - struct vgic_irq *irq; - struct its_vlpi_map map; - int ret; - - if (!vgic_supports_direct_msis(kvm)) - return 0; - - /* - * Get the ITS, and escape early on error (not a valid - * doorbell for any of our vITSs). - */ - its = vgic_get_its(kvm, irq_entry); - if (IS_ERR(its)) - return 0; - - mutex_lock(&its->its_lock); - - /* Perform the actual DevID/EventID -> LPI translation. */ - ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, - irq_entry->msi.data, &irq); - if (ret) - goto out; - - /* - * Emit the mapping request. If it fails, the ITS probably - * isn't v4 compatible, so let's silently bail out. Holding - * the ITS lock should ensure that nothing can modify the - * target vcpu. - */ - map = (struct its_vlpi_map) { - .vm = &kvm->arch.vgic.its_vm, - .vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe, - .vintid = irq->intid, - .properties = ((irq->priority & 0xfc) | - (irq->enabled ? 
LPI_PROP_ENABLED : 0) | - LPI_PROP_GROUP1), - .db_enabled = true, - }; - - ret = its_map_vlpi(virq, &map); - if (ret) - goto out; - - irq->hw = true; - irq->host_irq = virq; - atomic_inc(&map.vpe->vlpi_count); - -out: - mutex_unlock(&its->its_lock); - return ret; -} - -int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq, - struct kvm_kernel_irq_routing_entry *irq_entry) -{ - struct vgic_its *its; - struct vgic_irq *irq; - int ret; - - if (!vgic_supports_direct_msis(kvm)) - return 0; - - /* - * Get the ITS, and escape early on error (not a valid - * doorbell for any of our vITSs). - */ - its = vgic_get_its(kvm, irq_entry); - if (IS_ERR(its)) - return 0; - - mutex_lock(&its->its_lock); - - ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, - irq_entry->msi.data, &irq); - if (ret) - goto out; - - WARN_ON(!(irq->hw && irq->host_irq == virq)); - if (irq->hw) { - atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count); - irq->hw = false; - ret = its_unmap_vlpi(virq); - } - -out: - mutex_unlock(&its->its_lock); - return ret; -} diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c deleted file mode 100644 index 99b02ca730a8..000000000000 --- a/virt/kvm/arm/vgic/vgic.c +++ /dev/null @@ -1,1011 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2015, 2016 ARM Ltd. 
- */ - -#include -#include -#include -#include -#include -#include - -#include - -#include "vgic.h" - -#define CREATE_TRACE_POINTS -#include "trace.h" - -struct vgic_global kvm_vgic_global_state __ro_after_init = { - .gicv3_cpuif = STATIC_KEY_FALSE_INIT, -}; - -/* - * Locking order is always: - * kvm->lock (mutex) - * its->cmd_lock (mutex) - * its->its_lock (mutex) - * vgic_cpu->ap_list_lock must be taken with IRQs disabled - * kvm->lpi_list_lock must be taken with IRQs disabled - * vgic_irq->irq_lock must be taken with IRQs disabled - * - * As the ap_list_lock might be taken from the timer interrupt handler, - * we have to disable IRQs before taking this lock and everything lower - * than it. - * - * If you need to take multiple locks, always take the upper lock first, - * then the lower ones, e.g. first take the its_lock, then the irq_lock. - * If you are already holding a lock and need to take a higher one, you - * have to drop the lower ranking lock first and re-aquire it after having - * taken the upper one. - * - * When taking more than one ap_list_lock at the same time, always take the - * lowest numbered VCPU's ap_list_lock first, so: - * vcpuX->vcpu_id < vcpuY->vcpu_id: - * raw_spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock); - * raw_spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock); - * - * Since the VGIC must support injecting virtual interrupts from ISRs, we have - * to use the raw_spin_lock_irqsave/raw_spin_unlock_irqrestore versions of outer - * spinlocks for any lock that may be taken while injecting an interrupt. - */ - -/* - * Iterate over the VM's list of mapped LPIs to find the one with a - * matching interrupt ID and return a reference to the IRQ structure. 
- */ -static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_irq *irq = NULL; - unsigned long flags; - - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { - if (irq->intid != intid) - continue; - - /* - * This increases the refcount, the caller is expected to - * call vgic_put_irq() later once it's finished with the IRQ. - */ - vgic_get_irq_kref(irq); - goto out_unlock; - } - irq = NULL; - -out_unlock: - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); - - return irq; -} - -/* - * This looks up the virtual interrupt ID to get the corresponding - * struct vgic_irq. It also increases the refcount, so any caller is expected - * to call vgic_put_irq() once it's finished with this IRQ. - */ -struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, - u32 intid) -{ - /* SGIs and PPIs */ - if (intid <= VGIC_MAX_PRIVATE) { - intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1); - return &vcpu->arch.vgic_cpu.private_irqs[intid]; - } - - /* SPIs */ - if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) { - intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS); - return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; - } - - /* LPIs */ - if (intid >= VGIC_MIN_LPI) - return vgic_get_lpi(kvm, intid); - - WARN(1, "Looking up struct vgic_irq for reserved INTID"); - return NULL; -} - -/* - * We can't do anything in here, because we lack the kvm pointer to - * lock and remove the item from the lpi_list. So we keep this function - * empty and use the return value of kref_put() to trigger the freeing. - */ -static void vgic_irq_release(struct kref *ref) -{ -} - -/* - * Drop the refcount on the LPI. Must be called with lpi_list_lock held. 
- */ -void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - - if (!kref_put(&irq->refcount, vgic_irq_release)) - return; - - list_del(&irq->lpi_list); - dist->lpi_list_count--; - - kfree(irq); -} - -void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - unsigned long flags; - - if (irq->intid < VGIC_MIN_LPI) - return; - - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - __vgic_put_lpi_locked(kvm, irq); - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); -} - -void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_irq *irq, *tmp; - unsigned long flags; - - raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); - - list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) { - if (irq->intid >= VGIC_MIN_LPI) { - raw_spin_lock(&irq->irq_lock); - list_del(&irq->ap_list); - irq->vcpu = NULL; - raw_spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); - } - } - - raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); -} - -void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending) -{ - WARN_ON(irq_set_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - pending)); -} - -bool vgic_get_phys_line_level(struct vgic_irq *irq) -{ - bool line_level; - - BUG_ON(!irq->hw); - - if (irq->get_input_level) - return irq->get_input_level(irq->intid); - - WARN_ON(irq_get_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - &line_level)); - return line_level; -} - -/* Set/Clear the physical active state */ -void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active) -{ - - BUG_ON(!irq->hw); - WARN_ON(irq_set_irqchip_state(irq->host_irq, - IRQCHIP_STATE_ACTIVE, - active)); -} - -/** - * kvm_vgic_target_oracle - compute the target vcpu for an irq - * - * @irq: The irq to route. Must be already locked. 
- * - * Based on the current state of the interrupt (enabled, pending, - * active, vcpu and target_vcpu), compute the next vcpu this should be - * given to. Return NULL if this shouldn't be injected at all. - * - * Requires the IRQ lock to be held. - */ -static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) -{ - lockdep_assert_held(&irq->irq_lock); - - /* If the interrupt is active, it must stay on the current vcpu */ - if (irq->active) - return irq->vcpu ? : irq->target_vcpu; - - /* - * If the IRQ is not active but enabled and pending, we should direct - * it to its configured target VCPU. - * If the distributor is disabled, pending interrupts shouldn't be - * forwarded. - */ - if (irq->enabled && irq_is_pending(irq)) { - if (unlikely(irq->target_vcpu && - !irq->target_vcpu->kvm->arch.vgic.enabled)) - return NULL; - - return irq->target_vcpu; - } - - /* If neither active nor pending and enabled, then this IRQ should not - * be queued to any VCPU. - */ - return NULL; -} - -/* - * The order of items in the ap_lists defines how we'll pack things in LRs as - * well, the first items in the list being the first things populated in the - * LRs. - * - * A hard rule is that active interrupts can never be pushed out of the LRs - * (and therefore take priority) since we cannot reliably trap on deactivation - * of IRQs and therefore they have to be present in the LRs. - * - * Otherwise things should be sorted by the priority field and the GIC - * hardware support will take care of preemption of priority groups etc. - * - * Return negative if "a" sorts before "b", 0 to preserve order, and positive - * to sort "b" before "a". 
- */ -static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b) -{ - struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list); - struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list); - bool penda, pendb; - int ret; - - /* - * list_sort may call this function with the same element when - * the list is fairly long. - */ - if (unlikely(irqa == irqb)) - return 0; - - raw_spin_lock(&irqa->irq_lock); - raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING); - - if (irqa->active || irqb->active) { - ret = (int)irqb->active - (int)irqa->active; - goto out; - } - - penda = irqa->enabled && irq_is_pending(irqa); - pendb = irqb->enabled && irq_is_pending(irqb); - - if (!penda || !pendb) { - ret = (int)pendb - (int)penda; - goto out; - } - - /* Both pending and enabled, sort by priority */ - ret = irqa->priority - irqb->priority; -out: - raw_spin_unlock(&irqb->irq_lock); - raw_spin_unlock(&irqa->irq_lock); - return ret; -} - -/* Must be called with the ap_list_lock held */ -static void vgic_sort_ap_list(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - lockdep_assert_held(&vgic_cpu->ap_list_lock); - - list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp); -} - -/* - * Only valid injection if changing level for level-triggered IRQs or for a - * rising edge, and in-kernel connected IRQ lines can only be controlled by - * their owner. - */ -static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owner) -{ - if (irq->owner != owner) - return false; - - switch (irq->config) { - case VGIC_CONFIG_LEVEL: - return irq->line_level != level; - case VGIC_CONFIG_EDGE: - return level; - } - - return false; -} - -/* - * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list. - * Do the queuing if necessary, taking the right locks in the right order. - * Returns true when the IRQ was queued, false otherwise. 
- * - * Needs to be entered with the IRQ lock already held, but will return - * with all locks dropped. - */ -bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, - unsigned long flags) -{ - struct kvm_vcpu *vcpu; - - lockdep_assert_held(&irq->irq_lock); - -retry: - vcpu = vgic_target_oracle(irq); - if (irq->vcpu || !vcpu) { - /* - * If this IRQ is already on a VCPU's ap_list, then it - * cannot be moved or modified and there is no more work for - * us to do. - * - * Otherwise, if the irq is not pending and enabled, it does - * not need to be inserted into an ap_list and there is also - * no more work for us to do. - */ - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - - /* - * We have to kick the VCPU here, because we could be - * queueing an edge-triggered interrupt for which we - * get no EOI maintenance interrupt. In that case, - * while the IRQ is already on the VCPU's AP list, the - * VCPU could have EOI'ed the original interrupt and - * won't see this one until it exits for some other - * reason. - */ - if (vcpu) { - kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); - kvm_vcpu_kick(vcpu); - } - return false; - } - - /* - * We must unlock the irq lock to take the ap_list_lock where - * we are going to insert this new pending interrupt. - */ - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - - /* someone can do stuff here, which we re-check below */ - - raw_spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags); - raw_spin_lock(&irq->irq_lock); - - /* - * Did something change behind our backs? - * - * There are two cases: - * 1) The irq lost its pending state or was disabled behind our - * backs and/or it was queued to another VCPU's ap_list. - * 2) Someone changed the affinity on this irq behind our - * backs and we are now holding the wrong ap_list_lock. - * - * In both cases, drop the locks and retry. 
- */ - - if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) { - raw_spin_unlock(&irq->irq_lock); - raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, - flags); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - goto retry; - } - - /* - * Grab a reference to the irq to reflect the fact that it is - * now in the ap_list. - */ - vgic_get_irq_kref(irq); - list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); - irq->vcpu = vcpu; - - raw_spin_unlock(&irq->irq_lock); - raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags); - - kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); - kvm_vcpu_kick(vcpu); - - return true; -} - -/** - * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic - * @kvm: The VM structure pointer - * @cpuid: The CPU for PPIs - * @intid: The INTID to inject a new state to. - * @level: Edge-triggered: true: to trigger the interrupt - * false: to ignore the call - * Level-sensitive true: raise the input signal - * false: lower the input signal - * @owner: The opaque pointer to the owner of the IRQ being raised to verify - * that the caller is allowed to inject this IRQ. Userspace - * injections will have owner == NULL. - * - * The VGIC is not concerned with devices being active-LOW or active-HIGH for - * level-sensitive interrupts. You can think of the level parameter as 1 - * being HIGH and 0 being LOW and all devices being active-HIGH. 
- */ -int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, - bool level, void *owner) -{ - struct kvm_vcpu *vcpu; - struct vgic_irq *irq; - unsigned long flags; - int ret; - - trace_vgic_update_irq_pending(cpuid, intid, level); - - ret = vgic_lazy_init(kvm); - if (ret) - return ret; - - vcpu = kvm_get_vcpu(kvm, cpuid); - if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS) - return -EINVAL; - - irq = vgic_get_irq(kvm, vcpu, intid); - if (!irq) - return -EINVAL; - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - - if (!vgic_validate_injection(irq, level, owner)) { - /* Nothing to see here, move along... */ - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(kvm, irq); - return 0; - } - - if (irq->config == VGIC_CONFIG_LEVEL) - irq->line_level = level; - else - irq->pending_latch = true; - - vgic_queue_irq_unlock(kvm, irq, flags); - vgic_put_irq(kvm, irq); - - return 0; -} - -/* @irq->irq_lock must be held */ -static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq, - unsigned int host_irq, - bool (*get_input_level)(int vindid)) -{ - struct irq_desc *desc; - struct irq_data *data; - - /* - * Find the physical IRQ number corresponding to @host_irq - */ - desc = irq_to_desc(host_irq); - if (!desc) { - kvm_err("%s: no interrupt descriptor\n", __func__); - return -EINVAL; - } - data = irq_desc_get_irq_data(desc); - while (data->parent_data) - data = data->parent_data; - - irq->hw = true; - irq->host_irq = host_irq; - irq->hwintid = data->hwirq; - irq->get_input_level = get_input_level; - return 0; -} - -/* @irq->irq_lock must be held */ -static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq) -{ - irq->hw = false; - irq->hwintid = 0; - irq->get_input_level = NULL; -} - -int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, - u32 vintid, bool (*get_input_level)(int vindid)) -{ - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); - unsigned long flags; - int ret; - - BUG_ON(!irq); - - 
raw_spin_lock_irqsave(&irq->irq_lock, flags); - ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level); - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - - return ret; -} - -/** - * kvm_vgic_reset_mapped_irq - Reset a mapped IRQ - * @vcpu: The VCPU pointer - * @vintid: The INTID of the interrupt - * - * Reset the active and pending states of a mapped interrupt. Kernel - * subsystems injecting mapped interrupts should reset their interrupt lines - * when we are doing a reset of the VM. - */ -void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid) -{ - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); - unsigned long flags; - - if (!irq->hw) - goto out; - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - irq->active = false; - irq->pending_latch = false; - irq->line_level = false; - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); -out: - vgic_put_irq(vcpu->kvm, irq); -} - -int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid) -{ - struct vgic_irq *irq; - unsigned long flags; - - if (!vgic_initialized(vcpu->kvm)) - return -EAGAIN; - - irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); - BUG_ON(!irq); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - kvm_vgic_unmap_irq(irq); - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - - return 0; -} - -/** - * kvm_vgic_set_owner - Set the owner of an interrupt for a VM - * - * @vcpu: Pointer to the VCPU (used for PPIs) - * @intid: The virtual INTID identifying the interrupt (PPI or SPI) - * @owner: Opaque pointer to the owner - * - * Returns 0 if intid is not already used by another in-kernel device and the - * owner is set, otherwise returns an error code. 
- */ -int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner) -{ - struct vgic_irq *irq; - unsigned long flags; - int ret = 0; - - if (!vgic_initialized(vcpu->kvm)) - return -EAGAIN; - - /* SGIs and LPIs cannot be wired up to any device */ - if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid)) - return -EINVAL; - - irq = vgic_get_irq(vcpu->kvm, vcpu, intid); - raw_spin_lock_irqsave(&irq->irq_lock, flags); - if (irq->owner && irq->owner != owner) - ret = -EEXIST; - else - irq->owner = owner; - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - - return ret; -} - -/** - * vgic_prune_ap_list - Remove non-relevant interrupts from the list - * - * @vcpu: The VCPU pointer - * - * Go over the list of "interesting" interrupts, and prune those that we - * won't have to consider in the near future. - */ -static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_irq *irq, *tmp; - - DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - -retry: - raw_spin_lock(&vgic_cpu->ap_list_lock); - - list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) { - struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB; - bool target_vcpu_needs_kick = false; - - raw_spin_lock(&irq->irq_lock); - - BUG_ON(vcpu != irq->vcpu); - - target_vcpu = vgic_target_oracle(irq); - - if (!target_vcpu) { - /* - * We don't need to process this interrupt any - * further, move it off the list. - */ - list_del(&irq->ap_list); - irq->vcpu = NULL; - raw_spin_unlock(&irq->irq_lock); - - /* - * This vgic_put_irq call matches the - * vgic_get_irq_kref in vgic_queue_irq_unlock, - * where we added the LPI to the ap_list. As - * we remove the irq from the list, we drop - * also drop the refcount. - */ - vgic_put_irq(vcpu->kvm, irq); - continue; - } - - if (target_vcpu == vcpu) { - /* We're on the right CPU */ - raw_spin_unlock(&irq->irq_lock); - continue; - } - - /* This interrupt looks like it has to be migrated. 
*/ - - raw_spin_unlock(&irq->irq_lock); - raw_spin_unlock(&vgic_cpu->ap_list_lock); - - /* - * Ensure locking order by always locking the smallest - * ID first. - */ - if (vcpu->vcpu_id < target_vcpu->vcpu_id) { - vcpuA = vcpu; - vcpuB = target_vcpu; - } else { - vcpuA = target_vcpu; - vcpuB = vcpu; - } - - raw_spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock); - raw_spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock, - SINGLE_DEPTH_NESTING); - raw_spin_lock(&irq->irq_lock); - - /* - * If the affinity has been preserved, move the - * interrupt around. Otherwise, it means things have - * changed while the interrupt was unlocked, and we - * need to replay this. - * - * In all cases, we cannot trust the list not to have - * changed, so we restart from the beginning. - */ - if (target_vcpu == vgic_target_oracle(irq)) { - struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu; - - list_del(&irq->ap_list); - irq->vcpu = target_vcpu; - list_add_tail(&irq->ap_list, &new_cpu->ap_list_head); - target_vcpu_needs_kick = true; - } - - raw_spin_unlock(&irq->irq_lock); - raw_spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock); - raw_spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock); - - if (target_vcpu_needs_kick) { - kvm_make_request(KVM_REQ_IRQ_PENDING, target_vcpu); - kvm_vcpu_kick(target_vcpu); - } - - goto retry; - } - - raw_spin_unlock(&vgic_cpu->ap_list_lock); -} - -static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_fold_lr_state(vcpu); - else - vgic_v3_fold_lr_state(vcpu); -} - -/* Requires the irq_lock to be held. 
*/ -static inline void vgic_populate_lr(struct kvm_vcpu *vcpu, - struct vgic_irq *irq, int lr) -{ - lockdep_assert_held(&irq->irq_lock); - - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_populate_lr(vcpu, irq, lr); - else - vgic_v3_populate_lr(vcpu, irq, lr); -} - -static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_clear_lr(vcpu, lr); - else - vgic_v3_clear_lr(vcpu, lr); -} - -static inline void vgic_set_underflow(struct kvm_vcpu *vcpu) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_set_underflow(vcpu); - else - vgic_v3_set_underflow(vcpu); -} - -/* Requires the ap_list_lock to be held. */ -static int compute_ap_list_depth(struct kvm_vcpu *vcpu, - bool *multi_sgi) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_irq *irq; - int count = 0; - - *multi_sgi = false; - - lockdep_assert_held(&vgic_cpu->ap_list_lock); - - list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - int w; - - raw_spin_lock(&irq->irq_lock); - /* GICv2 SGIs can count for more than one... */ - w = vgic_irq_get_lr_count(irq); - raw_spin_unlock(&irq->irq_lock); - - count += w; - *multi_sgi |= (w > 1); - } - return count; -} - -/* Requires the VCPU's ap_list_lock to be held. */ -static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_irq *irq; - int count; - bool multi_sgi; - u8 prio = 0xff; - - lockdep_assert_held(&vgic_cpu->ap_list_lock); - - count = compute_ap_list_depth(vcpu, &multi_sgi); - if (count > kvm_vgic_global_state.nr_lr || multi_sgi) - vgic_sort_ap_list(vcpu); - - count = 0; - - list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - raw_spin_lock(&irq->irq_lock); - - /* - * If we have multi-SGIs in the pipeline, we need to - * guarantee that they are all seen before any IRQ of - * lower priority. In that case, we need to filter out - * these interrupts by exiting early. 
This is easy as - * the AP list has been sorted already. - */ - if (multi_sgi && irq->priority > prio) { - _raw_spin_unlock(&irq->irq_lock); - break; - } - - if (likely(vgic_target_oracle(irq) == vcpu)) { - vgic_populate_lr(vcpu, irq, count++); - - if (irq->source) - prio = irq->priority; - } - - raw_spin_unlock(&irq->irq_lock); - - if (count == kvm_vgic_global_state.nr_lr) { - if (!list_is_last(&irq->ap_list, - &vgic_cpu->ap_list_head)) - vgic_set_underflow(vcpu); - break; - } - } - - vcpu->arch.vgic_cpu.used_lrs = count; - - /* Nuke remaining LRs */ - for ( ; count < kvm_vgic_global_state.nr_lr; count++) - vgic_clear_lr(vcpu, count); -} - -static inline bool can_access_vgic_from_kernel(void) -{ - /* - * GICv2 can always be accessed from the kernel because it is - * memory-mapped, and VHE systems can access GICv3 EL2 system - * registers. - */ - return !static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) || has_vhe(); -} - -static inline void vgic_save_state(struct kvm_vcpu *vcpu) -{ - if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) - vgic_v2_save_state(vcpu); - else - __vgic_v3_save_state(vcpu); -} - -/* Sync back the hardware VGIC state into our emulation after a guest's run. */ -void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - /* An empty ap_list_head implies used_lrs == 0 */ - if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) - return; - - if (can_access_vgic_from_kernel()) - vgic_save_state(vcpu); - - if (vgic_cpu->used_lrs) - vgic_fold_lr_state(vcpu); - vgic_prune_ap_list(vcpu); -} - -static inline void vgic_restore_state(struct kvm_vcpu *vcpu) -{ - if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) - vgic_v2_restore_state(vcpu); - else - __vgic_v3_restore_state(vcpu); -} - -/* Flush our emulation state into the GIC hardware before entering the guest. 
*/ -void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) -{ - /* - * If there are no virtual interrupts active or pending for this - * VCPU, then there is no work to do and we can bail out without - * taking any lock. There is a potential race with someone injecting - * interrupts to the VCPU, but it is a benign race as the VCPU will - * either observe the new interrupt before or after doing this check, - * and introducing additional synchronization mechanism doesn't change - * this. - * - * Note that we still need to go through the whole thing if anything - * can be directly injected (GICv4). - */ - if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) && - !vgic_supports_direct_msis(vcpu->kvm)) - return; - - DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - - if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) { - raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); - vgic_flush_lr_state(vcpu); - raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); - } - - if (can_access_vgic_from_kernel()) - vgic_restore_state(vcpu); -} - -void kvm_vgic_load(struct kvm_vcpu *vcpu) -{ - if (unlikely(!vgic_initialized(vcpu->kvm))) - return; - - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_load(vcpu); - else - vgic_v3_load(vcpu); -} - -void kvm_vgic_put(struct kvm_vcpu *vcpu) -{ - if (unlikely(!vgic_initialized(vcpu->kvm))) - return; - - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_put(vcpu); - else - vgic_v3_put(vcpu); -} - -void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu) -{ - if (unlikely(!irqchip_in_kernel(vcpu->kvm))) - return; - - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_vmcr_sync(vcpu); - else - vgic_v3_vmcr_sync(vcpu); -} - -int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_irq *irq; - bool pending = false; - unsigned long flags; - struct vgic_vmcr vmcr; - - if (!vcpu->kvm->arch.vgic.enabled) - return false; - - if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last) - return true; - - 
vgic_get_vmcr(vcpu, &vmcr); - - raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); - - list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - raw_spin_lock(&irq->irq_lock); - pending = irq_is_pending(irq) && irq->enabled && - !irq->active && - irq->priority < vmcr.pmr; - raw_spin_unlock(&irq->irq_lock); - - if (pending) - break; - } - - raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); - - return pending; -} - -void vgic_kick_vcpus(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int c; - - /* - * We've injected an interrupt, time to find out who deserves - * a good kick... - */ - kvm_for_each_vcpu(c, vcpu, kvm) { - if (kvm_vgic_vcpu_pending_irq(vcpu)) { - kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); - kvm_vcpu_kick(vcpu); - } - } -} - -bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid) -{ - struct vgic_irq *irq; - bool map_is_active; - unsigned long flags; - - if (!vgic_initialized(vcpu->kvm)) - return false; - - irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); - raw_spin_lock_irqsave(&irq->irq_lock, flags); - map_is_active = irq->hw && irq->active; - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - - return map_is_active; -} diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h deleted file mode 100644 index 769e4802645e..000000000000 --- a/virt/kvm/arm/vgic/vgic.h +++ /dev/null @@ -1,321 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2015, 2016 ARM Ltd. 
- */ -#ifndef __KVM_ARM_VGIC_NEW_H__ -#define __KVM_ARM_VGIC_NEW_H__ - -#include - -#define PRODUCT_ID_KVM 0x4b /* ASCII code K */ -#define IMPLEMENTER_ARM 0x43b - -#define VGIC_ADDR_UNDEF (-1) -#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF) - -#define INTERRUPT_ID_BITS_SPIS 10 -#define INTERRUPT_ID_BITS_ITS 16 -#define VGIC_PRI_BITS 5 - -#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS) - -#define VGIC_AFFINITY_0_SHIFT 0 -#define VGIC_AFFINITY_0_MASK (0xffUL << VGIC_AFFINITY_0_SHIFT) -#define VGIC_AFFINITY_1_SHIFT 8 -#define VGIC_AFFINITY_1_MASK (0xffUL << VGIC_AFFINITY_1_SHIFT) -#define VGIC_AFFINITY_2_SHIFT 16 -#define VGIC_AFFINITY_2_MASK (0xffUL << VGIC_AFFINITY_2_SHIFT) -#define VGIC_AFFINITY_3_SHIFT 24 -#define VGIC_AFFINITY_3_MASK (0xffUL << VGIC_AFFINITY_3_SHIFT) - -#define VGIC_AFFINITY_LEVEL(reg, level) \ - ((((reg) & VGIC_AFFINITY_## level ##_MASK) \ - >> VGIC_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) - -/* - * The Userspace encodes the affinity differently from the MPIDR, - * Below macro converts vgic userspace format to MPIDR reg format. - */ -#define VGIC_TO_MPIDR(val) (VGIC_AFFINITY_LEVEL(val, 0) | \ - VGIC_AFFINITY_LEVEL(val, 1) | \ - VGIC_AFFINITY_LEVEL(val, 2) | \ - VGIC_AFFINITY_LEVEL(val, 3)) - -/* - * As per Documentation/virt/kvm/devices/arm-vgic-v3.txt, - * below macros are defined for CPUREG encoding. 
- */ -#define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK 0x000000000000c000 -#define KVM_REG_ARM_VGIC_SYSREG_OP0_SHIFT 14 -#define KVM_REG_ARM_VGIC_SYSREG_OP1_MASK 0x0000000000003800 -#define KVM_REG_ARM_VGIC_SYSREG_OP1_SHIFT 11 -#define KVM_REG_ARM_VGIC_SYSREG_CRN_MASK 0x0000000000000780 -#define KVM_REG_ARM_VGIC_SYSREG_CRN_SHIFT 7 -#define KVM_REG_ARM_VGIC_SYSREG_CRM_MASK 0x0000000000000078 -#define KVM_REG_ARM_VGIC_SYSREG_CRM_SHIFT 3 -#define KVM_REG_ARM_VGIC_SYSREG_OP2_MASK 0x0000000000000007 -#define KVM_REG_ARM_VGIC_SYSREG_OP2_SHIFT 0 - -#define KVM_DEV_ARM_VGIC_SYSREG_MASK (KVM_REG_ARM_VGIC_SYSREG_OP0_MASK | \ - KVM_REG_ARM_VGIC_SYSREG_OP1_MASK | \ - KVM_REG_ARM_VGIC_SYSREG_CRN_MASK | \ - KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \ - KVM_REG_ARM_VGIC_SYSREG_OP2_MASK) - -/* - * As per Documentation/virt/kvm/devices/arm-vgic-its.txt, - * below macros are defined for ITS table entry encoding. - */ -#define KVM_ITS_CTE_VALID_SHIFT 63 -#define KVM_ITS_CTE_VALID_MASK BIT_ULL(63) -#define KVM_ITS_CTE_RDBASE_SHIFT 16 -#define KVM_ITS_CTE_ICID_MASK GENMASK_ULL(15, 0) -#define KVM_ITS_ITE_NEXT_SHIFT 48 -#define KVM_ITS_ITE_PINTID_SHIFT 16 -#define KVM_ITS_ITE_PINTID_MASK GENMASK_ULL(47, 16) -#define KVM_ITS_ITE_ICID_MASK GENMASK_ULL(15, 0) -#define KVM_ITS_DTE_VALID_SHIFT 63 -#define KVM_ITS_DTE_VALID_MASK BIT_ULL(63) -#define KVM_ITS_DTE_NEXT_SHIFT 49 -#define KVM_ITS_DTE_NEXT_MASK GENMASK_ULL(62, 49) -#define KVM_ITS_DTE_ITTADDR_SHIFT 5 -#define KVM_ITS_DTE_ITTADDR_MASK GENMASK_ULL(48, 5) -#define KVM_ITS_DTE_SIZE_MASK GENMASK_ULL(4, 0) -#define KVM_ITS_L1E_VALID_MASK BIT_ULL(63) -/* we only support 64 kB translation table page size */ -#define KVM_ITS_L1E_ADDR_MASK GENMASK_ULL(51, 16) - -#define KVM_VGIC_V3_RDIST_INDEX_MASK GENMASK_ULL(11, 0) -#define KVM_VGIC_V3_RDIST_FLAGS_MASK GENMASK_ULL(15, 12) -#define KVM_VGIC_V3_RDIST_FLAGS_SHIFT 12 -#define KVM_VGIC_V3_RDIST_BASE_MASK GENMASK_ULL(51, 16) -#define KVM_VGIC_V3_RDIST_COUNT_MASK GENMASK_ULL(63, 52) -#define 
KVM_VGIC_V3_RDIST_COUNT_SHIFT 52 - -#ifdef CONFIG_DEBUG_SPINLOCK -#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p) -#else -#define DEBUG_SPINLOCK_BUG_ON(p) -#endif - -/* Requires the irq_lock to be held by the caller. */ -static inline bool irq_is_pending(struct vgic_irq *irq) -{ - if (irq->config == VGIC_CONFIG_EDGE) - return irq->pending_latch; - else - return irq->pending_latch || irq->line_level; -} - -static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq) -{ - return irq->config == VGIC_CONFIG_LEVEL && irq->hw; -} - -static inline int vgic_irq_get_lr_count(struct vgic_irq *irq) -{ - /* Account for the active state as an interrupt */ - if (vgic_irq_is_sgi(irq->intid) && irq->source) - return hweight8(irq->source) + irq->active; - - return irq_is_pending(irq) || irq->active; -} - -static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq) -{ - return vgic_irq_get_lr_count(irq) > 1; -} - -/* - * This struct provides an intermediate representation of the fields contained - * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC - * state to userspace can generate either GICv2 or GICv3 CPU interface - * registers regardless of the hardware backed GIC used. 
- */ -struct vgic_vmcr { - u32 grpen0; - u32 grpen1; - - u32 ackctl; - u32 fiqen; - u32 cbpr; - u32 eoim; - - u32 abpr; - u32 bpr; - u32 pmr; /* Priority mask field in the GICC_PMR and - * ICC_PMR_EL1 priority field format */ -}; - -struct vgic_reg_attr { - struct kvm_vcpu *vcpu; - gpa_t addr; -}; - -int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, - struct vgic_reg_attr *reg_attr); -int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, - struct vgic_reg_attr *reg_attr); -const struct vgic_register_region * -vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, - gpa_t addr, int len); -struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, - u32 intid); -void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq); -void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); -bool vgic_get_phys_line_level(struct vgic_irq *irq); -void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); -void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); -bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, - unsigned long flags); -void vgic_kick_vcpus(struct kvm *kvm); - -int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t alignment); - -void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); -void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); -void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); -void vgic_v2_set_underflow(struct kvm_vcpu *vcpu); -void vgic_v2_set_npie(struct kvm_vcpu *vcpu); -int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); -int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val); -int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val); -void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void 
vgic_v2_enable(struct kvm_vcpu *vcpu); -int vgic_v2_probe(const struct gic_kvm_info *info); -int vgic_v2_map_resources(struct kvm *kvm); -int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, - enum vgic_type); - -void vgic_v2_init_lrs(void); -void vgic_v2_load(struct kvm_vcpu *vcpu); -void vgic_v2_put(struct kvm_vcpu *vcpu); -void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu); - -void vgic_v2_save_state(struct kvm_vcpu *vcpu); -void vgic_v2_restore_state(struct kvm_vcpu *vcpu); - -static inline void vgic_get_irq_kref(struct vgic_irq *irq) -{ - if (irq->intid < VGIC_MIN_LPI) - return; - - kref_get(&irq->refcount); -} - -void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); -void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); -void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); -void vgic_v3_set_underflow(struct kvm_vcpu *vcpu); -void vgic_v3_set_npie(struct kvm_vcpu *vcpu); -void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v3_enable(struct kvm_vcpu *vcpu); -int vgic_v3_probe(const struct gic_kvm_info *info); -int vgic_v3_map_resources(struct kvm *kvm); -int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq); -int vgic_v3_save_pending_tables(struct kvm *kvm); -int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count); -int vgic_register_redist_iodev(struct kvm_vcpu *vcpu); -bool vgic_v3_check_base(struct kvm *kvm); - -void vgic_v3_load(struct kvm_vcpu *vcpu); -void vgic_v3_put(struct kvm_vcpu *vcpu); -void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu); - -bool vgic_has_its(struct kvm *kvm); -int kvm_vgic_register_its_device(void); -void vgic_enable_lpis(struct kvm_vcpu *vcpu); -void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu); -int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); -int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); -int 
vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val); -int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val); -int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, - u64 id, u64 *val); -int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, - u64 *reg); -int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, - u32 intid, u64 *val); -int kvm_register_vgic_device(unsigned long type); -void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -int vgic_lazy_init(struct kvm *kvm); -int vgic_init(struct kvm *kvm); - -void vgic_debug_init(struct kvm *kvm); -void vgic_debug_destroy(struct kvm *kvm); - -bool lock_all_vcpus(struct kvm *kvm); -void unlock_all_vcpus(struct kvm *kvm); - -static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu; - - /* - * num_pri_bits are initialized with HW supported values. - * We can rely safely on num_pri_bits even if VM has not - * restored ICC_CTLR_EL1 before restoring APnR registers. 
- */ - switch (cpu_if->num_pri_bits) { - case 7: return 3; - case 6: return 1; - default: return 0; - } -} - -static inline bool -vgic_v3_redist_region_full(struct vgic_redist_region *region) -{ - if (!region->count) - return false; - - return (region->free_index >= region->count); -} - -struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rdregs); - -static inline size_t -vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg) -{ - if (!rdreg->count) - return atomic_read(&kvm->online_vcpus) * KVM_VGIC_V3_REDIST_SIZE; - else - return rdreg->count * KVM_VGIC_V3_REDIST_SIZE; -} - -struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, - u32 index); - -bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size); - -static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size) -{ - struct vgic_dist *d = &kvm->arch.vgic; - - return (base + size > d->vgic_dist_base) && - (base < d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE); -} - -int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr); -int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, - u32 devid, u32 eventid, struct vgic_irq **irq); -struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi); -int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi); -void vgic_lpi_translation_cache_init(struct kvm *kvm); -void vgic_lpi_translation_cache_destroy(struct kvm *kvm); -void vgic_its_invalidate_cache(struct kvm *kvm); - -bool vgic_supports_direct_msis(struct kvm *kvm); -int vgic_v4_init(struct kvm *kvm); -void vgic_v4_teardown(struct kvm *kvm); -void vgic_v4_configure_vsgis(struct kvm *kvm); - -#endif -- cgit v1.2.3 From d82755b2e781c8989614c82df7582f5649e265b8 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 5 May 2020 16:45:17 +0100 Subject: KVM: arm64: Kill off CONFIG_KVM_ARM_HOST CONFIG_KVM_ARM_HOST is just a proxy for CONFIG_KVM, so remove it in favour of the latter. 
Signed-off-by: Will Deacon Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200505154520.194120-2-tabba@google.com --- arch/arm64/kernel/asm-offsets.c | 2 +- arch/arm64/kernel/cpu_errata.c | 2 +- arch/arm64/kernel/smp.c | 2 +- arch/arm64/kvm/Kconfig | 6 ----- arch/arm64/kvm/Makefile | 52 ++++++++++++++++++++--------------------- arch/arm64/kvm/hyp/Makefile | 22 ++++++++--------- 6 files changed, 40 insertions(+), 46 deletions(-) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 9981a0a5a87f..a27e0cd731e9 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -96,7 +96,7 @@ int main(void) DEFINE(CPU_BOOT_PTRAUTH_KEY, offsetof(struct secondary_data, ptrauth_key)); #endif BLANK(); -#ifdef CONFIG_KVM_ARM_HOST +#ifdef CONFIG_KVM DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); DEFINE(VCPU_FAULT_DISR, offsetof(struct kvm_vcpu, arch.fault.disr_el1)); DEFINE(VCPU_WORKAROUND_FLAGS, offsetof(struct kvm_vcpu, arch.workaround_flags)); diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index df56d2295d16..a102321fc8a2 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -234,7 +234,7 @@ static int detect_harden_bp_fw(void) smccc_end = NULL; break; -#if IS_ENABLED(CONFIG_KVM_ARM_HOST) +#if IS_ENABLED(CONFIG_KVM) case SMCCC_CONDUIT_SMC: cb = call_smc_arch_workaround_1; smccc_start = __smccc_workaround_1_smc; diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 061f60fe452f..0a3045d9f33f 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -430,7 +430,7 @@ static void __init hyp_mode_check(void) "CPU: CPUs started in inconsistent modes"); else pr_info("CPU: All CPU(s) started at EL1\n"); - if (IS_ENABLED(CONFIG_KVM_ARM_HOST)) + if (IS_ENABLED(CONFIG_KVM)) kvm_compute_layout(); } diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 449386d76441..ce724e526689 100644 
--- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -28,7 +28,6 @@ config KVM select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO - select KVM_ARM_HOST select KVM_GENERIC_DIRTYLOG_READ_PROTECT select SRCU select KVM_VFIO @@ -50,11 +49,6 @@ config KVM If unsure, say N. -config KVM_ARM_HOST - bool - ---help--- - Provides host support for ARM processors. - config KVM_ARM_PMU bool ---help--- diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 7a3768538343..419696e615b3 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -7,33 +7,33 @@ ccflags-y += -I $(srctree)/$(src) KVM=../../../virt/kvm -obj-$(CONFIG_KVM_ARM_HOST) += kvm.o -obj-$(CONFIG_KVM_ARM_HOST) += hyp/ +obj-$(CONFIG_KVM) += kvm.o +obj-$(CONFIG_KVM) += hyp/ -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/eventfd.o $(KVM)/vfio.o $(KVM)/irqchip.o -kvm-$(CONFIG_KVM_ARM_HOST) += arm.o mmu.o mmio.o -kvm-$(CONFIG_KVM_ARM_HOST) += psci.o perf.o -kvm-$(CONFIG_KVM_ARM_HOST) += hypercalls.o -kvm-$(CONFIG_KVM_ARM_HOST) += pvtime.o +kvm-$(CONFIG_KVM) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o +kvm-$(CONFIG_KVM) += $(KVM)/eventfd.o $(KVM)/vfio.o $(KVM)/irqchip.o +kvm-$(CONFIG_KVM) += arm.o mmu.o mmio.o +kvm-$(CONFIG_KVM) += psci.o perf.o +kvm-$(CONFIG_KVM) += hypercalls.o +kvm-$(CONFIG_KVM) += pvtime.o -kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o va_layout.o -kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o -kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o -kvm-$(CONFIG_KVM_ARM_HOST) += vgic-sys-reg-v3.o fpsimd.o pmu.o -kvm-$(CONFIG_KVM_ARM_HOST) += aarch32.o -kvm-$(CONFIG_KVM_ARM_HOST) += arch_timer.o +kvm-$(CONFIG_KVM) += inject_fault.o regmap.o va_layout.o +kvm-$(CONFIG_KVM) += hyp.o hyp-init.o handle_exit.o +kvm-$(CONFIG_KVM) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o +kvm-$(CONFIG_KVM) += 
vgic-sys-reg-v3.o fpsimd.o pmu.o +kvm-$(CONFIG_KVM) += aarch32.o +kvm-$(CONFIG_KVM) += arch_timer.o kvm-$(CONFIG_KVM_ARM_PMU) += pmu-emul.o -kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic.o -kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-init.o -kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-irqfd.o -kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-v2.o -kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-v3.o -kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-v4.o -kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-mmio.o -kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-mmio-v2.o -kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-mmio-v3.o -kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-kvm-device.o -kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-its.o -kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-debug.o +kvm-$(CONFIG_KVM) += vgic/vgic.o +kvm-$(CONFIG_KVM) += vgic/vgic-init.o +kvm-$(CONFIG_KVM) += vgic/vgic-irqfd.o +kvm-$(CONFIG_KVM) += vgic/vgic-v2.o +kvm-$(CONFIG_KVM) += vgic/vgic-v3.o +kvm-$(CONFIG_KVM) += vgic/vgic-v4.o +kvm-$(CONFIG_KVM) += vgic/vgic-mmio.o +kvm-$(CONFIG_KVM) += vgic/vgic-mmio-v2.o +kvm-$(CONFIG_KVM) += vgic/vgic-mmio-v3.o +kvm-$(CONFIG_KVM) += vgic/vgic-kvm-device.o +kvm-$(CONFIG_KVM) += vgic/vgic-its.o +kvm-$(CONFIG_KVM) += vgic/vgic-debug.o diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index dc18274a6826..8229e47ba870 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -6,17 +6,17 @@ ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING \ $(DISABLE_STACKLEAK_PLUGIN) -obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o -obj-$(CONFIG_KVM_ARM_HOST) += timer-sr.o -obj-$(CONFIG_KVM_ARM_HOST) += aarch32.o -obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-cpuif-proxy.o -obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o -obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o -obj-$(CONFIG_KVM_ARM_HOST) += entry.o -obj-$(CONFIG_KVM_ARM_HOST) += switch.o -obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o -obj-$(CONFIG_KVM_ARM_HOST) += tlb.o -obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o +obj-$(CONFIG_KVM) += vgic-v3-sr.o +obj-$(CONFIG_KVM) += 
timer-sr.o +obj-$(CONFIG_KVM) += aarch32.o +obj-$(CONFIG_KVM) += vgic-v2-cpuif-proxy.o +obj-$(CONFIG_KVM) += sysreg-sr.o +obj-$(CONFIG_KVM) += debug-sr.o +obj-$(CONFIG_KVM) += entry.o +obj-$(CONFIG_KVM) += switch.o +obj-$(CONFIG_KVM) += fpsimd.o +obj-$(CONFIG_KVM) += tlb.o +obj-$(CONFIG_KVM) += hyp-entry.o # KVM code is run at a different exception code with a different map, so # compiler instrumentation that inserts callbacks or checks into the code may -- cgit v1.2.3 From bf7bc1df30f6c6afa34d4d1d53e1c8ad93510d3e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 5 May 2020 16:45:18 +0100 Subject: KVM: arm64: Update help text arm64 KVM supports 16k pages since 02e0b7600f83 ("arm64: kvm: Add support for 16K pages"), so update the Kconfig help text accordingly. Signed-off-by: Will Deacon Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200505154520.194120-3-tabba@google.com --- arch/arm64/kvm/Kconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index ce724e526689..d2cf4f099454 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -44,8 +44,6 @@ config KVM select TASK_DELAY_ACCT ---help--- Support hosting virtualized guest machines. - We don't support KVM with 16K page tables yet, due to the multiple - levels of fake page tables. If unsure, say N. -- cgit v1.2.3 From f26133624d602b0d984815168a2d3a1f630b02e2 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 5 May 2020 16:45:19 +0100 Subject: KVM: arm64: Change CONFIG_KVM to a menuconfig entry Changing CONFIG_KVM to be a 'menuconfig' entry in Kconfig means that we can straightforwardly enumerate optional features, such as the virtual PMU device, as dependent options.
Signed-off-by: Will Deacon Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200505154520.194120-4-tabba@google.com --- arch/arm64/kvm/Kconfig | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index d2cf4f099454..f1c1f981482c 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -3,7 +3,6 @@ # KVM configuration # -source "virt/kvm/Kconfig" source "virt/lib/Kconfig" menuconfig VIRTUALIZATION @@ -18,7 +17,7 @@ menuconfig VIRTUALIZATION if VIRTUALIZATION -config KVM +menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" depends on OF # for TASKSTATS/TASK_DELAY_ACCT: @@ -33,7 +32,6 @@ config KVM select KVM_VFIO select HAVE_KVM_EVENTFD select HAVE_KVM_IRQFD - select KVM_ARM_PMU if HW_PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING @@ -47,13 +45,21 @@ config KVM If unsure, say N. +if KVM + +source "virt/kvm/Kconfig" + config KVM_ARM_PMU - bool + bool "Virtual Performance Monitoring Unit (PMU) support" + depends on HW_PERF_EVENTS + default y ---help--- Adds support for a virtual Performance Monitoring Unit (PMU) in virtual machines. config KVM_INDIRECT_VECTORS - def_bool KVM && (HARDEN_BRANCH_PREDICTOR || HARDEN_EL2_VECTORS) + def_bool HARDEN_BRANCH_PREDICTOR || HARDEN_EL2_VECTORS + +endif # KVM endif # VIRTUALIZATION -- cgit v1.2.3 From 25357de01b95140ecacd4d9347d74df2dda789f2 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 5 May 2020 16:45:20 +0100 Subject: KVM: arm64: Clean up kvm makefiles Consolidate references to the CONFIG_KVM configuration item to encompass entire folders rather than per line. 
Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200505154520.194120-5-tabba@google.com --- arch/arm64/kvm/Makefile | 38 +++++++++++++------------------------- arch/arm64/kvm/hyp/Makefile | 15 ++++----------- 2 files changed, 17 insertions(+), 36 deletions(-) diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 419696e615b3..8d3d9513cbfe 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -10,30 +10,18 @@ KVM=../../../virt/kvm obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM) += hyp/ -kvm-$(CONFIG_KVM) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o -kvm-$(CONFIG_KVM) += $(KVM)/eventfd.o $(KVM)/vfio.o $(KVM)/irqchip.o -kvm-$(CONFIG_KVM) += arm.o mmu.o mmio.o -kvm-$(CONFIG_KVM) += psci.o perf.o -kvm-$(CONFIG_KVM) += hypercalls.o -kvm-$(CONFIG_KVM) += pvtime.o +kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \ + $(KVM)/vfio.o $(KVM)/irqchip.o \ + arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \ + inject_fault.o regmap.o va_layout.o hyp.o hyp-init.o handle_exit.o \ + guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o \ + vgic-sys-reg-v3.o fpsimd.o pmu.o \ + aarch32.o arch_timer.o \ + vgic/vgic.o vgic/vgic-init.o \ + vgic/vgic-irqfd.o vgic/vgic-v2.o \ + vgic/vgic-v3.o vgic/vgic-v4.o \ + vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \ + vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \ + vgic/vgic-its.o vgic/vgic-debug.o -kvm-$(CONFIG_KVM) += inject_fault.o regmap.o va_layout.o -kvm-$(CONFIG_KVM) += hyp.o hyp-init.o handle_exit.o -kvm-$(CONFIG_KVM) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o -kvm-$(CONFIG_KVM) += vgic-sys-reg-v3.o fpsimd.o pmu.o -kvm-$(CONFIG_KVM) += aarch32.o -kvm-$(CONFIG_KVM) += arch_timer.o kvm-$(CONFIG_KVM_ARM_PMU) += pmu-emul.o - -kvm-$(CONFIG_KVM) += vgic/vgic.o -kvm-$(CONFIG_KVM) += vgic/vgic-init.o -kvm-$(CONFIG_KVM) += vgic/vgic-irqfd.o -kvm-$(CONFIG_KVM) += vgic/vgic-v2.o -kvm-$(CONFIG_KVM) += vgic/vgic-v3.o 
-kvm-$(CONFIG_KVM) += vgic/vgic-v4.o -kvm-$(CONFIG_KVM) += vgic/vgic-mmio.o -kvm-$(CONFIG_KVM) += vgic/vgic-mmio-v2.o -kvm-$(CONFIG_KVM) += vgic/vgic-mmio-v3.o -kvm-$(CONFIG_KVM) += vgic/vgic-kvm-device.o -kvm-$(CONFIG_KVM) += vgic/vgic-its.o -kvm-$(CONFIG_KVM) += vgic/vgic-debug.o diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index 8229e47ba870..8c9880783839 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -6,17 +6,10 @@ ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING \ $(DISABLE_STACKLEAK_PLUGIN) -obj-$(CONFIG_KVM) += vgic-v3-sr.o -obj-$(CONFIG_KVM) += timer-sr.o -obj-$(CONFIG_KVM) += aarch32.o -obj-$(CONFIG_KVM) += vgic-v2-cpuif-proxy.o -obj-$(CONFIG_KVM) += sysreg-sr.o -obj-$(CONFIG_KVM) += debug-sr.o -obj-$(CONFIG_KVM) += entry.o -obj-$(CONFIG_KVM) += switch.o -obj-$(CONFIG_KVM) += fpsimd.o -obj-$(CONFIG_KVM) += tlb.o -obj-$(CONFIG_KVM) += hyp-entry.o +obj-$(CONFIG_KVM) += hyp.o + +hyp-y := vgic-v3-sr.o timer-sr.o aarch32.o vgic-v2-cpuif-proxy.o sysreg-sr.o \ + debug-sr.o entry.o switch.o fpsimd.o tlb.o hyp-entry.o # KVM code is run at a different exception code with a different map, so # compiler instrumentation that inserts callbacks or checks into the code may -- cgit v1.2.3 From c6fe89ff8b250ad4dc4bed7bd5877bfbc35f4aba Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 13 May 2020 11:58:29 +0100 Subject: KVM: arm64: Simplify __kvm_timer_set_cntvoff implementation Now that this function isn't constrained by the 32bit PCS, let's simplify it by taking a single 64bit offset instead of two 32bit parameters. 
Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_asm.h | 2 +- arch/arm64/kvm/arch_timer.c | 12 +----------- arch/arm64/kvm/hyp/timer-sr.c | 3 +-- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 7c7eeeaab9fa..59e314f38e43 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -64,7 +64,7 @@ extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu); -extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high); +extern void __kvm_timer_set_cntvoff(u64 cntvoff); extern int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 93bd59b46848..487eba9f87cd 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -451,17 +451,7 @@ out: static void set_cntvoff(u64 cntvoff) { - u32 low = lower_32_bits(cntvoff); - u32 high = upper_32_bits(cntvoff); - - /* - * Since kvm_call_hyp doesn't fully support the ARM PCS especially on - * 32-bit systems, but rather passes register by register shifted one - * place (we put the function address in r0/x0), we cannot simply pass - * a 64-bit value as an argument, but have to split the value in two - * 32-bit halves. 
- */ - kvm_call_hyp(__kvm_timer_set_cntvoff, low, high); + kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff); } static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active) diff --git a/arch/arm64/kvm/hyp/timer-sr.c b/arch/arm64/kvm/hyp/timer-sr.c index ff76e6845fe4..fb5c0be33223 100644 --- a/arch/arm64/kvm/hyp/timer-sr.c +++ b/arch/arm64/kvm/hyp/timer-sr.c @@ -10,9 +10,8 @@ #include -void __hyp_text __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high) +void __hyp_text __kvm_timer_set_cntvoff(u64 cntvoff) { - u64 cntvoff = (u64)cntvoff_high << 32 | cntvoff_low; write_sysreg(cntvoff, cntvoff_el2); } -- cgit v1.2.3 From ce6f8f02f9f6786355fa6c79d88b839639dd75d8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 13 May 2020 11:38:28 +0100 Subject: KVM: arm64: Use cpus_have_final_cap for has_vhe() By the time we start using the has_vhe() helper, we have long discovered whether we are running VHE or not. It thus makes sense to use cpus_have_final_cap() instead of cpus_have_const_cap(), which leads to a small text size reduction. Signed-off-by: Marc Zyngier Acked-by: David Brazdil Link: https://lore.kernel.org/r/20200513103828.74580-1-maz@kernel.org --- arch/arm64/include/asm/virt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 61fd26752adc..5051b388c654 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -85,7 +85,7 @@ static inline bool is_kernel_in_hyp_mode(void) static __always_inline bool has_vhe(void) { - if (cpus_have_const_cap(ARM64_HAS_VIRT_HOST_EXTN)) + if (cpus_have_final_cap(ARM64_HAS_VIRT_HOST_EXTN)) return true; return false; -- cgit v1.2.3 From 656012c731fcfd0f770007366e2b952a613745f2 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Wed, 1 Apr 2020 15:03:10 +0100 Subject: KVM: Fix spelling in code comments Fix spelling and typos (e.g., repeated words) in comments. 
Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200401140310.29701-1-tabba@google.com --- arch/arm64/kvm/arm.c | 6 +++--- arch/arm64/kvm/guest.c | 4 ++-- arch/arm64/kvm/hyp/vgic-v3-sr.c | 2 +- arch/arm64/kvm/mmio.c | 2 +- arch/arm64/kvm/mmu.c | 6 +++--- arch/arm64/kvm/psci.c | 6 +++--- arch/arm64/kvm/reset.c | 6 +++--- arch/arm64/kvm/sys_regs.c | 6 +++--- arch/arm64/kvm/vgic/vgic-v3.c | 2 +- virt/kvm/coalesced_mmio.c | 2 +- virt/kvm/eventfd.c | 2 +- virt/kvm/kvm_main.c | 2 +- 12 files changed, 23 insertions(+), 23 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index c958bb37b769..ee1b5bba1d08 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -455,9 +455,9 @@ void force_vm_exit(const cpumask_t *mask) * * The hardware supports a limited set of values with the value zero reserved * for the host, so we check if an assigned value belongs to a previous - * generation, which which requires us to assign a new value. If we're the - * first to use a VMID for the new generation, we must flush necessary caches - * and TLBs on all CPUs. + * generation, which requires us to assign a new value. If we're the first to + * use a VMID for the new generation, we must flush necessary caches and TLBs + * on all CPUs. */ static bool need_new_vmid_gen(struct kvm_vmid *vmid) { diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 50a279d3ddd7..871d51729b63 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -267,7 +267,7 @@ static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) /* * Vector lengths supported by the host can't currently be * hidden from the guest individually: instead we can only set a - * maxmium via ZCR_EL2.LEN. So, make sure the available vector + * maximum via ZCR_EL2.LEN. 
So, make sure the available vector * lengths match the set requested exactly up to the requested * maximum: */ @@ -337,7 +337,7 @@ static int sve_reg_to_region(struct sve_state_reg_region *region, unsigned int reg_num; unsigned int reqoffset, reqlen; /* User-requested offset and length */ - unsigned int maxlen; /* Maxmimum permitted length */ + unsigned int maxlen; /* Maximum permitted length */ size_t sve_state_size; diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 49fedf6710f9..6b85773e15c4 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -577,7 +577,7 @@ static u8 __hyp_text __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp) /* * The priority value is independent of any of the BPR values, so we - * normalize it using the minumal BPR value. This guarantees that no + * normalize it using the minimal BPR value. This guarantees that no * matter what the guest does with its BPR, we can always set/get the * same value of a priority. */ diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c index aedfcff99ac5..4e0366759726 100644 --- a/arch/arm64/kvm/mmio.c +++ b/arch/arm64/kvm/mmio.c @@ -131,7 +131,7 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, /* * No valid syndrome? Ask userspace for help if it has - * voluntered to do so, and bail out otherwise. + * volunteered to do so, and bail out otherwise. */ if (!kvm_vcpu_dabt_isvalid(vcpu)) { if (vcpu->kvm->arch.return_nisv_io_abort_to_user) { diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index e3b9ee268823..29d8f24df944 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -784,7 +784,7 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, mutex_lock(&kvm_hyp_pgd_mutex); /* - * This assumes that we we have enough space below the idmap + * This assumes that we have enough space below the idmap * page to allocate our VAs. If not, the check below will * kick. 
A potential alternative would be to detect that * overflow and switch to an allocation above the idmap. @@ -964,7 +964,7 @@ static void stage2_unmap_memslot(struct kvm *kvm, * stage2_unmap_vm - Unmap Stage-2 RAM mappings * @kvm: The struct kvm pointer * - * Go through the memregions and unmap any reguler RAM + * Go through the memregions and unmap any regular RAM * backing memory already mapped to the VM. */ void stage2_unmap_vm(struct kvm *kvm) @@ -2262,7 +2262,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, { /* * At this point memslot has been committed and there is an - * allocated dirty_bitmap[], dirty pages will be be tracked while the + * allocated dirty_bitmap[], dirty pages will be tracked while the * memory slot is write protected. */ if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index ae364716ee40..83415e96b589 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -94,7 +94,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) /* * NOTE: We always update r0 (or x0) because for PSCI v0.1 - * the general puspose registers are undefined upon CPU_ON. + * the general purpose registers are undefined upon CPU_ON. */ reset_state->r0 = smccc_get_arg3(source_vcpu); @@ -265,10 +265,10 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) case PSCI_0_2_FN_SYSTEM_OFF: kvm_psci_system_off(vcpu); /* - * We should'nt be going back to guest VCPU after + * We shouldn't be going back to guest VCPU after * receiving SYSTEM_OFF request. * - * If user space accidently/deliberately resumes + * If user space accidentally/deliberately resumes * guest VCPU after SYSTEM_OFF request then guest * VCPU should see internal failure from PSCI return * value. 
To achieve this, we preload r0 (or x0) with diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 30b7ea680f66..658f3a79617b 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -163,7 +163,7 @@ static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu) vl = vcpu->arch.sve_max_vl; /* - * Resposibility for these properties is shared between + * Responsibility for these properties is shared between * kvm_arm_init_arch_resources(), kvm_vcpu_enable_sve() and * set_sve_vls(). Double-check here just to be sure: */ @@ -249,7 +249,7 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) * ioctl or as part of handling a request issued by another VCPU in the PSCI * handling code. In the first case, the VCPU will not be loaded, and in the * second case the VCPU will be loaded. Because this function operates purely - * on the memory-backed valus of system registers, we want to do a full put if + * on the memory-backed values of system registers, we want to do a full put if * we were loaded (handling a request) and load the values back at the end of * the function. Otherwise we leave the state alone. In both cases, we * disable preemption around the vcpu reset as we would otherwise race with @@ -357,7 +357,7 @@ void kvm_set_ipa_limit(void) * * So clamp the ipa limit further down to limit the number of levels. * Since we can concatenate upto 16 tables at entry level, we could - * go upto 4bits above the maximum VA addressible with the current + * go upto 4bits above the maximum VA addressable with the current * number of levels. */ va_max = PGDIR_SHIFT + PAGE_SHIFT - 3; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 51db934702b6..620eaf11e672 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -34,7 +34,7 @@ #include "trace.h" /* - * All of this file is extremly similar to the ARM coproc.c, but the + * All of this file is extremely similar to the ARM coproc.c, but the * types are different. 
My gut feeling is that it should be pretty * easy to merge, but that would be an ABI breakage -- again. VFP * would also need to be abstracted. @@ -118,8 +118,8 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) * entry to the guest but are only restored on vcpu_load. * * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but - * should never be listed below, because the the MPIDR should only be - * set once, before running the VCPU, and never changed later. + * should never be listed below, because the MPIDR should only be set + * once, before running the VCPU, and never changed later. */ switch (reg) { case CSSELR_EL1: write_sysreg_s(val, SYS_CSSELR_EL1); return; diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 5bc2ab58954b..3ccd6d3cb4d3 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -587,7 +587,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info) int ret; /* - * The ListRegs field is 5 bits, but there is a architectural + * The ListRegs field is 5 bits, but there is an architectural * maximum of 16 list registers. Just ignore bit 4... */ kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1; diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 00c747dbc82e..e2c197fd4f9d 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -119,7 +119,7 @@ int kvm_coalesced_mmio_init(struct kvm *kvm) /* * We're using this spinlock to sync access to the coalesced ring. - * The list doesn't need it's own lock since device registration and + * The list doesn't need its own lock since device registration and * unregistration should only happen when kvm->slots_lock is held. 
*/ spin_lock_init(&kvm->ring_lock); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 67b6fc153e9c..e586d1395c28 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -116,7 +116,7 @@ irqfd_shutdown(struct work_struct *work) struct kvm *kvm = irqfd->kvm; u64 cnt; - /* Make sure irqfd has been initalized in assign path. */ + /* Make sure irqfd has been initialized in assign path. */ synchronize_srcu(&kvm->irq_srcu); /* diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 74bdb7bf3295..f57792b1541b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2799,7 +2799,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); * * (a) VCPU which has not done pl-exit or cpu relax intercepted recently * (preempted lock holder), indicated by @in_spin_loop. - * Set at the beiginning and cleared at the end of interception/PLE handler. + * Set at the beginning and cleared at the end of interception/PLE handler. * * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get * chance last time (mostly it has become eligible now since we have probably -- cgit v1.2.3 From 892713e97ca146591515b3c115f99cdf632030fb Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Wed, 15 Apr 2020 15:28:35 +0800 Subject: KVM: arm64: Sidestep stage2_unmap_vm() on vcpu reset when S2FWB is supported stage2_unmap_vm() was introduced to unmap user RAM region in the stage2 page table to make the caches coherent. E.g., a guest reboot with stage1 MMU disabled will access memory using non-cacheable attributes. If the RAM and caches are not coherent at this stage, some evicted dirty cache line may go and corrupt guest data in RAM. Since ARMv8.4, the S2FWB feature is mandatory and KVM will take advantage of it to configure the stage2 page table and the attributes of memory access. So we ensure that guests always access memory using cacheable attributes and thus, the caches will always be coherent. So on CPUs that support S2FWB, we can safely reset the vcpu without a heavy stage2 unmapping.
Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200415072835.1164-1-yuzenghui@huawei.com --- arch/arm64/kvm/arm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ee1b5bba1d08..0ea9a0266d9a 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -983,8 +983,11 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, /* * Ensure a rebooted VM will fault in RAM pages and detect if the * guest MMU is turned off and flush the caches as needed. + * + * S2FWB enforces all memory accesses to RAM being cacheable, we + * ensure that the cache is always coherent. */ - if (vcpu->arch.has_run_once) + if (vcpu->arch.has_run_once && !cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) stage2_unmap_vm(vcpu->kvm); vcpu_reset_hcr(vcpu); -- cgit v1.2.3 From 48c963e31bc664afafd31058483ea8390da63980 Mon Sep 17 00:00:00 2001 From: Jiang Yi Date: Wed, 15 Apr 2020 10:42:29 +0200 Subject: KVM: arm/arm64: Release kvm->mmu_lock in loop to prevent starvation Do cond_resched_lock() in stage2_flush_memslot() like what is done in unmap_stage2_range() and other places holding mmu_lock while processing a possibly large range of memory. 
Signed-off-by: Jiang Yi Signed-off-by: Marc Zyngier Reviewed-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20200415084229.29992-1-giangyi@amazon.com --- arch/arm64/kvm/mmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 29d8f24df944..917363375e8a 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -422,6 +422,9 @@ static void stage2_flush_memslot(struct kvm *kvm, next = stage2_pgd_addr_end(kvm, addr, end); if (!stage2_pgd_none(kvm, *pgd)) stage2_flush_puds(kvm, pgd, addr, next); + + if (next != end) + cond_resched_lock(&kvm->mmu_lock); } while (pgd++, addr = next, addr != end); } -- cgit v1.2.3 From 9f2836146b11cdf98d5c8f8f71b0fce28fbd83c8 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 7 May 2020 20:35:45 +0800 Subject: KVM: arm64: Clean up the checking for huge mapping If we are checking whether the stage2 can map PAGE_SIZE, we don't have to do the boundary checks as both the host VMA and the guest memslots are page aligned. Bail out early in that case. While we're at it, fix up a typo in the comment below.
Signed-off-by: Suzuki K Poulose Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200507123546.1875-2-yuzenghui@huawei.com --- arch/arm64/kvm/mmu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 917363375e8a..ccb44e7d30d9 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1610,6 +1610,10 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, hva_t uaddr_start, uaddr_end; size_t size; + /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */ + if (map_size == PAGE_SIZE) + return true; + size = memslot->npages * PAGE_SIZE; gpa_start = memslot->base_gfn << PAGE_SHIFT; @@ -1629,7 +1633,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| * +-----+--------------------+--------------------+---+ * - * memslot->base_gfn << PAGE_SIZE: + * memslot->base_gfn << PAGE_SHIFT: * +---+--------------------+--------------------+-----+ * |abc|def Stage-2 block | Stage-2 block |tvxyz| * +---+--------------------+--------------------+-----+ -- cgit v1.2.3 From 0529c9021252a58b6d3808da86986a614b900b1b Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 7 May 2020 20:35:46 +0800 Subject: KVM: arm64: Unify handling THP backed host memory We support mapping host memory backed by PMD transparent hugepages at stage2 as huge pages. However the checks are now spread across two different places. Let us unify the handling of the THPs to keep the code cleaner (and future proof for PUD THP support). This patch moves transparent_hugepage_adjust() closer to the caller to avoid a forward declaration for fault_supports_stage2_huge_mappings(). Also, since we already handle the case where the host VA and the guest PA may not be aligned, the explicit VM_BUG_ON() is not required. 
Signed-off-by: Suzuki K Poulose Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200507123546.1875-3-yuzenghui@huawei.com --- arch/arm64/kvm/mmu.c | 115 +++++++++++++++++++++++++++------------------------ 1 file changed, 60 insertions(+), 55 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index ccb44e7d30d9..66eb8e3f6e8c 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1375,47 +1375,6 @@ out: return ret; } -static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) -{ - kvm_pfn_t pfn = *pfnp; - gfn_t gfn = *ipap >> PAGE_SHIFT; - - if (kvm_is_transparent_hugepage(pfn)) { - unsigned long mask; - /* - * The address we faulted on is backed by a transparent huge - * page. However, because we map the compound huge page and - * not the individual tail page, we need to transfer the - * refcount to the head page. We have to be careful that the - * THP doesn't start to split while we are adjusting the - * refcounts. - * - * We are sure this doesn't happen, because mmu_notifier_retry - * was successful and we are holding the mmu_lock, so if this - * THP is trying to split, it will be blocked in the mmu - * notifier before touching any of the pages, specifically - * before being able to call __split_huge_page_refcount(). - * - * We can therefore safely transfer the refcount from PG_tail - * to PG_head and switch the pfn from a tail page to the head - * page accordingly. 
- */ - mask = PTRS_PER_PMD - 1; - VM_BUG_ON((gfn & mask) != (pfn & mask)); - if (pfn & mask) { - *ipap &= PMD_MASK; - kvm_release_pfn_clean(pfn); - pfn &= ~mask; - kvm_get_pfn(pfn); - *pfnp = pfn; - } - - return true; - } - - return false; -} - /** * stage2_wp_ptes - write protect PMD range * @pmd: pointer to pmd entry @@ -1663,6 +1622,59 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, (hva & ~(map_size - 1)) + map_size <= uaddr_end; } +/* + * Check if the given hva is backed by a transparent huge page (THP) and + * whether it can be mapped using block mapping in stage2. If so, adjust + * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently + * supported. This will need to be updated to support other THP sizes. + * + * Returns the size of the mapping. + */ +static unsigned long +transparent_hugepage_adjust(struct kvm_memory_slot *memslot, + unsigned long hva, kvm_pfn_t *pfnp, + phys_addr_t *ipap) +{ + kvm_pfn_t pfn = *pfnp; + + /* + * Make sure the adjustment is done only for THP pages. Also make + * sure that the HVA and IPA are sufficiently aligned and that the + * block map is contained within the memslot. + */ + if (kvm_is_transparent_hugepage(pfn) && + fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { + /* + * The address we faulted on is backed by a transparent huge + * page. However, because we map the compound huge page and + * not the individual tail page, we need to transfer the + * refcount to the head page. We have to be careful that the + * THP doesn't start to split while we are adjusting the + * refcounts. + * + * We are sure this doesn't happen, because mmu_notifier_retry + * was successful and we are holding the mmu_lock, so if this + * THP is trying to split, it will be blocked in the mmu + * notifier before touching any of the pages, specifically + * before being able to call __split_huge_page_refcount(). 
+ * + * We can therefore safely transfer the refcount from PG_tail + * to PG_head and switch the pfn from a tail page to the head + * page accordingly. + */ + *ipap &= PMD_MASK; + kvm_release_pfn_clean(pfn); + pfn &= ~(PTRS_PER_PMD - 1); + kvm_get_pfn(pfn); + *pfnp = pfn; + + return PMD_SIZE; + } + + /* Use page mapping if we cannot use block mapping. */ + return PAGE_SIZE; +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) @@ -1776,20 +1788,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (vma_pagesize == PAGE_SIZE && !force_pte) { - /* - * Only PMD_SIZE transparent hugepages(THP) are - * currently supported. This code will need to be - * updated to support other THP sizes. - * - * Make sure the host VA and the guest IPA are sufficiently - * aligned and that the block is contained within the memslot. - */ - if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) && - transparent_hugepage_adjust(&pfn, &fault_ipa)) - vma_pagesize = PMD_SIZE; - } - + /* + * If we are not forced to use page mapping, check if we are + * backed by a THP and thus use block mapping if possible. + */ + if (vma_pagesize == PAGE_SIZE && !force_pte) + vma_pagesize = transparent_hugepage_adjust(memslot, hva, + &pfn, &fault_ipa); if (writable) kvm_set_pfn_dirty(pfn); -- cgit v1.2.3 From c862626e19efdc26b26481515470b160e8fe52f3 Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Mon, 13 Apr 2020 20:20:23 +0800 Subject: KVM: arm64: Support enabling dirty log gradually in small chunks There is already support of enabling dirty log gradually in small chunks for x86 in commit 3c9bd4006bfc ("KVM: x86: enable dirty log gradually in small chunks"). This adds support for arm64. x86 still writes protect all huge pages when DIRTY_LOG_INITIALLY_ALL_SET is enabled. 
However, for arm64, both huge pages and normal pages can be write protected gradually by userspace. Under the Huawei Kunpeng 920 2.6GHz platform, I did some tests on 128G Linux VMs with different page size. The memory pressure is 127G in each case. The time taken of memory_global_dirty_log_start in QEMU is listed below: Page Size Before After Optimization 4K 650ms 1.8ms 2M 4ms 1.8ms 1G 2ms 1.8ms Besides the time reduction, the biggest improvement is that we will minimize the performance side effect (because of dissolving huge pages and marking memslots dirty) on guest after enabling dirty log. Signed-off-by: Keqian Zhu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200413122023.52583-1-zhukeqian1@huawei.com --- Documentation/virt/kvm/api.rst | 2 +- arch/arm64/include/asm/kvm_host.h | 3 +++ arch/arm64/kvm/mmu.c | 12 ++++++++++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index efbbe570aa9b..0017f63fa44f 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5777,7 +5777,7 @@ will be initialized to 1 when created. This also improves performance because dirty logging can be enabled gradually in small chunks on the first call to KVM_CLEAR_DIRTY_LOG. KVM_DIRTY_LOG_INITIALLY_SET depends on KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (it is also only available on -x86 for now). +x86 and arm64 for now). 
KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 32c8a675e5a4..a723f84fab83 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -46,6 +46,9 @@ #define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3) #define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4) +#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ + KVM_DIRTY_LOG_INITIALLY_SET) + DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); extern unsigned int kvm_sve_max_vl; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 66eb8e3f6e8c..ddf85bf21897 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -2277,8 +2277,16 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * allocated dirty_bitmap[], dirty pages will be tracked while the * memory slot is write protected. */ - if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) - kvm_mmu_wp_memory_region(kvm, mem->slot); + if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { + /* + * If we're with initial-all-set, we don't need to write + * protect any pages because they're all reported as dirty. + * Huge pages and normal pages will be write protect gradually. + */ + if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) { + kvm_mmu_wp_memory_region(kvm, mem->slot); + } + } } int kvm_arch_prepare_memory_region(struct kvm *kvm, -- cgit v1.2.3 From 5107000faa6e8c2b0ff7a91a6d1f010f84596cd2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 27 Apr 2020 15:15:07 +0100 Subject: KVM: arm64: Make KVM_CAP_MAX_VCPUS compatible with the selected GIC version KVM_CAP_MAX_VCPUS always returns the maximum possible number of VCPUs, irrespective of the selected interrupt controller.
This is pretty misleading for userspace that selects a GICv2 on a GICv3 system that supports v2 compat: It always gets a maximum of 512 VCPUs, even if the effective limit is 8. The 9th VCPU will fail to be created, which is unexpected as far as userspace is concerned. Fortunately, we already have the right information stashed in the kvm structure, and we can return it as requested. Reported-by: Ard Biesheuvel Signed-off-by: Marc Zyngier Tested-by: Alexandru Elisei Reviewed-by: Alexandru Elisei Link: https://lore.kernel.org/r/20200427141507.284985-1-maz@kernel.org --- arch/arm64/kvm/arm.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 0ea9a0266d9a..e01d44df98df 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -95,6 +95,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, return r; } +static int kvm_arm_default_max_vcpus(void) +{ + return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS; +} + /** * kvm_arch_init_vm - initializes a VM data structure * @kvm: pointer to the KVM struct @@ -128,8 +133,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.vmid.vmid_gen = 0; /* The maximum number of VCPUs is limited by the host's GIC model */ - kvm->arch.max_vcpus = vgic_present ? 
- kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS; + kvm->arch.max_vcpus = kvm_arm_default_max_vcpus(); return ret; out_free_stage2_pgd: @@ -204,10 +208,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = num_online_cpus(); break; case KVM_CAP_MAX_VCPUS: - r = KVM_MAX_VCPUS; - break; case KVM_CAP_MAX_VCPU_ID: - r = KVM_MAX_VCPU_ID; + if (kvm) + r = kvm->arch.max_vcpus; + else + r = kvm_arm_default_max_vcpus(); break; case KVM_CAP_MSI_DEVID: if (!kvm) -- cgit v1.2.3 From 71b3ec5f221b8b3ff545639be83ddfcd5d7c9800 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Fri, 15 May 2020 16:20:56 +0100 Subject: KVM: arm64: Clean up cpu_init_hyp_mode() Pull bits of code to the only place where it is used. Remove empty function __cpu_init_stage2(). Remove redundant has_vhe() check since this function is nVHE-only. No functional changes intended. Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200515152056.83158-1-dbrazdil@google.com --- arch/arm64/include/asm/kvm_asm.h | 2 ++ arch/arm64/include/asm/kvm_host.h | 35 ----------------------------------- arch/arm64/kvm/arm.c | 32 +++++++++++++++++++++++++++----- 3 files changed, 29 insertions(+), 40 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 59e314f38e43..0c9b5fc4ba0a 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -70,6 +70,8 @@ extern int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu); extern int __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu); +extern void __kvm_enable_ssbs(void); + extern u64 __vgic_v3_get_ich_vtr_el2(void); extern u64 __vgic_v3_read_vmcr(void); extern void __vgic_v3_write_vmcr(u32 vmcr); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index a723f84fab83..69a338a390a6 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -533,39 +533,6 @@ static inline void kvm_init_host_cpu_context(struct kvm_cpu_context 
*cpu_ctxt) cpu_ctxt->sys_regs[MPIDR_EL1] = read_cpuid_mpidr(); } -void __kvm_enable_ssbs(void); - -static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, - unsigned long hyp_stack_ptr, - unsigned long vector_ptr) -{ - /* - * Calculate the raw per-cpu offset without a translation from the - * kernel's mapping to the linear mapping, and store it in tpidr_el2 - * so that we can use adr_l to access per-cpu variables in EL2. - */ - u64 tpidr_el2 = ((u64)this_cpu_ptr(&kvm_host_data) - - (u64)kvm_ksym_ref(kvm_host_data)); - - /* - * Call initialization code, and switch to the full blown HYP code. - * If the cpucaps haven't been finalized yet, something has gone very - * wrong, and hyp will crash and burn when it uses any - * cpus_have_const_cap() wrapper. - */ - BUG_ON(!system_capabilities_finalized()); - __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2); - - /* - * Disabling SSBD on a non-VHE system requires us to enable SSBS - * at EL2. - */ - if (!has_vhe() && this_cpu_has_cap(ARM64_SSBS) && - arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) { - kvm_call_hyp(__kvm_enable_ssbs); - } -} - static inline bool kvm_arch_requires_vhe(void) { /* @@ -601,8 +568,6 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); -static inline void __cpu_init_stage2(void) {} - /* Guest/host FPSIMD coordination helpers */ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e01d44df98df..b0b569f2cdd0 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1273,19 +1273,41 @@ static void cpu_init_hyp_mode(void) { phys_addr_t pgd_ptr; unsigned long hyp_stack_ptr; - unsigned long stack_page; unsigned long vector_ptr; + unsigned long tpidr_el2; /* Switch from the HYP stub to our own HYP init vector */ __hyp_set_vectors(kvm_get_idmap_vector()); + /* + * 
Calculate the raw per-cpu offset without a translation from the + * kernel's mapping to the linear mapping, and store it in tpidr_el2 + * so that we can use adr_l to access per-cpu variables in EL2. + */ + tpidr_el2 = ((unsigned long)this_cpu_ptr(&kvm_host_data) - + (unsigned long)kvm_ksym_ref(kvm_host_data)); + pgd_ptr = kvm_mmu_get_httbr(); - stack_page = __this_cpu_read(kvm_arm_hyp_stack_page); - hyp_stack_ptr = stack_page + PAGE_SIZE; + hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE; vector_ptr = (unsigned long)kvm_get_hyp_vector(); - __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr); - __cpu_init_stage2(); + /* + * Call initialization code, and switch to the full blown HYP code. + * If the cpucaps haven't been finalized yet, something has gone very + * wrong, and hyp will crash and burn when it uses any + * cpus_have_const_cap() wrapper. + */ + BUG_ON(!system_capabilities_finalized()); + __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2); + + /* + * Disabling SSBD on a non-VHE system requires us to enable SSBS + * at EL2. + */ + if (this_cpu_has_cap(ARM64_SSBS) && + arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) { + kvm_call_hyp(__kvm_enable_ssbs); + } } static void cpu_hyp_reset(void) -- cgit v1.2.3 From 438f711ce1d889632467be80779c8f5762b107d7 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Fri, 15 May 2020 16:25:50 +0100 Subject: KVM: arm64: Fix incorrect comment on kvm_get_hyp_vector() The comment used to say that kvm_get_hyp_vector is only called on VHE systems. In fact, it is also called from the nVHE init function cpu_init_hyp_mode(). Fix the comment to stop confusing devs. 
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200515152550.83810-1-dbrazdil@google.com --- arch/arm64/include/asm/kvm_mmu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 30b0e8d6b895..796f6a2e794a 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -473,7 +473,7 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, extern void *__kvm_bp_vect_base; extern int __kvm_harden_el2_vector_slot; -/* This is only called on a VHE system */ +/* This is called on both VHE and !VHE systems */ static inline void *kvm_get_hyp_vector(void) { struct bp_hardening_data *data = arm64_get_bp_hardening_data(); -- cgit v1.2.3 From 0a78791c0d12fcd5d3f486668defb9ab055e3729 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 19 May 2020 11:40:36 +0100 Subject: KVM: arm64: Remove obsolete kvm_virt_to_phys abstraction This abstraction was introduced to hide the difference between arm and arm64 but, with the former no longer supported, this abstraction can be removed and the canonical kernel API used directly instead. 
Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier CC: Marc Zyngier CC: James Morse CC: Suzuki K Poulose Link: https://lore.kernel.org/r/20200519104036.259917-1-ascull@google.com --- arch/arm64/include/asm/kvm_mmu.h | 2 -- arch/arm64/kvm/mmu.c | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 796f6a2e794a..53bd4d517a4d 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -363,8 +363,6 @@ static inline void __kvm_flush_dcache_pud(pud_t pud) } } -#define kvm_virt_to_phys(x) __pa_symbol(x) - void kvm_set_way_flush(struct kvm_vcpu *vcpu); void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index ddf85bf21897..a1f6bc70c4e4 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -2197,11 +2197,11 @@ int kvm_mmu_init(void) { int err; - hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start); + hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); - hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end); + hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end); hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE); - hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init); + hyp_idmap_vector = __pa_symbol(__kvm_hyp_init); /* * We rely on the linker script to ensure at build time that the HYP -- cgit v1.2.3 From fc5d1f1a42fba6266ab95dc3b84937933a9b5a66 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sat, 1 Dec 2018 08:41:28 -0800 Subject: KVM: arm64: vgic-v3: Take cpu_if pointer directly instead of vcpu If we move the used_lrs field to the version-specific cpu interface structure, the following functions only operate on the struct vgic_v3_cpu_if and not the full vcpu: __vgic_v3_save_state __vgic_v3_restore_state __vgic_v3_activate_traps __vgic_v3_deactivate_traps __vgic_v3_save_aprs __vgic_v3_restore_aprs This 
is going to be very useful for nested virt, so move the used_lrs field and change the prototypes and implementations of these functions to take the cpu_if parameter directly. No functional change. Reviewed-by: James Morse Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_hyp.h | 12 ++++++------ arch/arm64/kvm/hyp/switch.c | 8 ++++---- arch/arm64/kvm/hyp/vgic-v3-sr.c | 33 ++++++++++----------------------- arch/arm64/kvm/vgic/vgic-v2.c | 10 +++++----- arch/arm64/kvm/vgic/vgic-v3.c | 14 ++++++++------ arch/arm64/kvm/vgic/vgic.c | 25 +++++++++++++++++-------- include/kvm/arm_vgic.h | 5 ++++- 7 files changed, 54 insertions(+), 53 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index fe57f60f06a8..4f67b0cdffe8 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -56,12 +56,12 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu); -void __vgic_v3_save_state(struct kvm_vcpu *vcpu); -void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); -void __vgic_v3_activate_traps(struct kvm_vcpu *vcpu); -void __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu); -void __vgic_v3_save_aprs(struct kvm_vcpu *vcpu); -void __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu); +void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if); int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu); void __timer_enable_traps(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 8a1e81a400e0..c07a45643cd4 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -270,8 +270,8 @@ static void __hyp_text 
__deactivate_vm(struct kvm_vcpu *vcpu) static void __hyp_text __hyp_vgic_save_state(struct kvm_vcpu *vcpu) { if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { - __vgic_v3_save_state(vcpu); - __vgic_v3_deactivate_traps(vcpu); + __vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3); + __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3); } } @@ -279,8 +279,8 @@ static void __hyp_text __hyp_vgic_save_state(struct kvm_vcpu *vcpu) static void __hyp_text __hyp_vgic_restore_state(struct kvm_vcpu *vcpu) { if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { - __vgic_v3_activate_traps(vcpu); - __vgic_v3_restore_state(vcpu); + __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3); + __vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3); } } diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 6b85773e15c4..10ed539835c1 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -194,10 +194,9 @@ static u32 __hyp_text __vgic_v3_read_ap1rn(int n) return val; } -void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) +void __hyp_text __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) { - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; + u64 used_lrs = cpu_if->used_lrs; /* * Make sure stores to the GIC via the memory mapped interface @@ -230,10 +229,9 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) } } -void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) +void __hyp_text __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) { - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; + u64 used_lrs = cpu_if->used_lrs; int i; if (used_lrs || cpu_if->its_vpe.its_vm) { @@ -257,10 +255,8 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) } } -void __hyp_text __vgic_v3_activate_traps(struct kvm_vcpu *vcpu) +void __hyp_text 
__vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) { - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - /* * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a * Group0 interrupt (as generated in GICv2 mode) to be @@ -306,9 +302,8 @@ void __hyp_text __vgic_v3_activate_traps(struct kvm_vcpu *vcpu) write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); } -void __hyp_text __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu) +void __hyp_text __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) { - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; u64 val; if (!cpu_if->vgic_sre) { @@ -333,15 +328,11 @@ void __hyp_text __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu) write_gicreg(0, ICH_HCR_EL2); } -void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu) +void __hyp_text __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) { - struct vgic_v3_cpu_if *cpu_if; u64 val; u32 nr_pre_bits; - vcpu = kern_hyp_va(vcpu); - cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - val = read_gicreg(ICH_VTR_EL2); nr_pre_bits = vtr_to_nr_pre_bits(val); @@ -370,15 +361,11 @@ void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu) } } -void __hyp_text __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu) +void __hyp_text __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if) { - struct vgic_v3_cpu_if *cpu_if; u64 val; u32 nr_pre_bits; - vcpu = kern_hyp_va(vcpu); - cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - val = read_gicreg(ICH_VTR_EL2); nr_pre_bits = vtr_to_nr_pre_bits(val); @@ -451,7 +438,7 @@ static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu, u32 vmcr, u64 *lr_val) { - unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs; + unsigned int used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; u8 priority = GICv3_IDLE_PRIORITY; int i, lr = -1; @@ -490,7 +477,7 @@ static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu, static int __hyp_text __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu, int intid, u64 *lr_val) { - unsigned int used_lrs = 
vcpu->arch.vgic_cpu.used_lrs; + unsigned int used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; int i; for (i = 0; i < used_lrs; i++) { diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 621cc168fe3f..ebf53a4e1296 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -56,7 +56,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) cpuif->vgic_hcr &= ~GICH_HCR_UIE; - for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { + for (lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) { u32 val = cpuif->vgic_lr[lr]; u32 cpuid, intid = val & GICH_LR_VIRTUALID; struct vgic_irq *irq; @@ -120,7 +120,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) vgic_put_irq(vcpu->kvm, irq); } - vgic_cpu->used_lrs = 0; + cpuif->used_lrs = 0; } /* @@ -427,7 +427,7 @@ out: static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; + u64 used_lrs = cpu_if->used_lrs; u64 elrsr; int i; @@ -448,7 +448,7 @@ static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) void vgic_v2_save_state(struct kvm_vcpu *vcpu) { void __iomem *base = kvm_vgic_global_state.vctrl_base; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; + u64 used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs; if (!base) return; @@ -463,7 +463,7 @@ void vgic_v2_restore_state(struct kvm_vcpu *vcpu) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; void __iomem *base = kvm_vgic_global_state.vctrl_base; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; + u64 used_lrs = cpu_if->used_lrs; int i; if (!base) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 3ccd6d3cb4d3..76e2d85789ed 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -39,7 +39,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) cpuif->vgic_hcr &= ~ICH_HCR_UIE; - for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { + for (lr = 0; lr < 
cpuif->used_lrs; lr++) { u64 val = cpuif->vgic_lr[lr]; u32 intid, cpuid; struct vgic_irq *irq; @@ -111,7 +111,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) vgic_put_irq(vcpu->kvm, irq); } - vgic_cpu->used_lrs = 0; + cpuif->used_lrs = 0; } /* Requires the irq to be locked already */ @@ -662,10 +662,10 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) if (likely(cpu_if->vgic_sre)) kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); - kvm_call_hyp(__vgic_v3_restore_aprs, vcpu); + kvm_call_hyp(__vgic_v3_restore_aprs, kern_hyp_va(cpu_if)); if (has_vhe()) - __vgic_v3_activate_traps(vcpu); + __vgic_v3_activate_traps(cpu_if); WARN_ON(vgic_v4_load(vcpu)); } @@ -680,12 +680,14 @@ void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu) void vgic_v3_put(struct kvm_vcpu *vcpu) { + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + WARN_ON(vgic_v4_put(vcpu, false)); vgic_v3_vmcr_sync(vcpu); - kvm_call_hyp(__vgic_v3_save_aprs, vcpu); + kvm_call_hyp(__vgic_v3_save_aprs, kern_hyp_va(cpu_if)); if (has_vhe()) - __vgic_v3_deactivate_traps(vcpu); + __vgic_v3_deactivate_traps(cpu_if); } diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 99b02ca730a8..c3643b7f101b 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -786,6 +786,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) int count; bool multi_sgi; u8 prio = 0xff; + int i = 0; lockdep_assert_held(&vgic_cpu->ap_list_lock); @@ -827,11 +828,14 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) } } - vcpu->arch.vgic_cpu.used_lrs = count; - /* Nuke remaining LRs */ - for ( ; count < kvm_vgic_global_state.nr_lr; count++) - vgic_clear_lr(vcpu, count); + for (i = count ; i < kvm_vgic_global_state.nr_lr; i++) + vgic_clear_lr(vcpu, i); + + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + vcpu->arch.vgic_cpu.vgic_v2.used_lrs = count; + else + vcpu->arch.vgic_cpu.vgic_v3.used_lrs = count; } static inline bool can_access_vgic_from_kernel(void) @@ -849,13 
+853,13 @@ static inline void vgic_save_state(struct kvm_vcpu *vcpu) if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_save_state(vcpu); else - __vgic_v3_save_state(vcpu); + __vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3); } /* Sync back the hardware VGIC state into our emulation after a guest's run. */ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + int used_lrs; /* An empty ap_list_head implies used_lrs == 0 */ if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) @@ -864,7 +868,12 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) if (can_access_vgic_from_kernel()) vgic_save_state(vcpu); - if (vgic_cpu->used_lrs) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs; + else + used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; + + if (used_lrs) vgic_fold_lr_state(vcpu); vgic_prune_ap_list(vcpu); } @@ -874,7 +883,7 @@ static inline void vgic_restore_state(struct kvm_vcpu *vcpu) if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_restore_state(vcpu); else - __vgic_v3_restore_state(vcpu); + __vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3); } /* Flush our emulation state into the GIC hardware before entering the guest. */ diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 69f4164d6477..a8d8fdcd3723 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -274,6 +274,8 @@ struct vgic_v2_cpu_if { u32 vgic_vmcr; u32 vgic_apr; u32 vgic_lr[VGIC_V2_MAX_LRS]; + + unsigned int used_lrs; }; struct vgic_v3_cpu_if { @@ -291,6 +293,8 @@ struct vgic_v3_cpu_if { * linking the Linux IRQ subsystem and the ITS together. 
*/ struct its_vpe its_vpe; + + unsigned int used_lrs; }; struct vgic_cpu { @@ -300,7 +304,6 @@ struct vgic_cpu { struct vgic_v3_cpu_if vgic_v3; }; - unsigned int used_lrs; struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS]; raw_spinlock_t ap_list_lock; /* Protects the ap_list */ -- cgit v1.2.3 From 7ea90bdd70c9cf82dfbaa54e7d9f296928679224 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Jun 2019 11:17:00 +0100 Subject: KVM: arm64: Refactor vcpu_{read,write}_sys_reg Extract the direct HW accessors for later reuse. Reviewed-by: James Morse Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 128 +++++++++++++++++++++++++--------------------- 1 file changed, 71 insertions(+), 57 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 620eaf11e672..50e328ca1419 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -64,11 +64,8 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu, return false; } -u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) +static bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val) { - if (!vcpu->arch.sysregs_loaded_on_cpu) - goto immediate_read; - /* * System registers listed in the switch are not saved on every * exit from the guest but are only saved on vcpu_put. @@ -79,40 +76,37 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) * thread when emulating cross-VCPU communication. 
*/ switch (reg) { - case CSSELR_EL1: return read_sysreg_s(SYS_CSSELR_EL1); - case SCTLR_EL1: return read_sysreg_s(SYS_SCTLR_EL12); - case ACTLR_EL1: return read_sysreg_s(SYS_ACTLR_EL1); - case CPACR_EL1: return read_sysreg_s(SYS_CPACR_EL12); - case TTBR0_EL1: return read_sysreg_s(SYS_TTBR0_EL12); - case TTBR1_EL1: return read_sysreg_s(SYS_TTBR1_EL12); - case TCR_EL1: return read_sysreg_s(SYS_TCR_EL12); - case ESR_EL1: return read_sysreg_s(SYS_ESR_EL12); - case AFSR0_EL1: return read_sysreg_s(SYS_AFSR0_EL12); - case AFSR1_EL1: return read_sysreg_s(SYS_AFSR1_EL12); - case FAR_EL1: return read_sysreg_s(SYS_FAR_EL12); - case MAIR_EL1: return read_sysreg_s(SYS_MAIR_EL12); - case VBAR_EL1: return read_sysreg_s(SYS_VBAR_EL12); - case CONTEXTIDR_EL1: return read_sysreg_s(SYS_CONTEXTIDR_EL12); - case TPIDR_EL0: return read_sysreg_s(SYS_TPIDR_EL0); - case TPIDRRO_EL0: return read_sysreg_s(SYS_TPIDRRO_EL0); - case TPIDR_EL1: return read_sysreg_s(SYS_TPIDR_EL1); - case AMAIR_EL1: return read_sysreg_s(SYS_AMAIR_EL12); - case CNTKCTL_EL1: return read_sysreg_s(SYS_CNTKCTL_EL12); - case PAR_EL1: return read_sysreg_s(SYS_PAR_EL1); - case DACR32_EL2: return read_sysreg_s(SYS_DACR32_EL2); - case IFSR32_EL2: return read_sysreg_s(SYS_IFSR32_EL2); - case DBGVCR32_EL2: return read_sysreg_s(SYS_DBGVCR32_EL2); + case CSSELR_EL1: *val = read_sysreg_s(SYS_CSSELR_EL1); break; + case SCTLR_EL1: *val = read_sysreg_s(SYS_SCTLR_EL12); break; + case ACTLR_EL1: *val = read_sysreg_s(SYS_ACTLR_EL1); break; + case CPACR_EL1: *val = read_sysreg_s(SYS_CPACR_EL12); break; + case TTBR0_EL1: *val = read_sysreg_s(SYS_TTBR0_EL12); break; + case TTBR1_EL1: *val = read_sysreg_s(SYS_TTBR1_EL12); break; + case TCR_EL1: *val = read_sysreg_s(SYS_TCR_EL12); break; + case ESR_EL1: *val = read_sysreg_s(SYS_ESR_EL12); break; + case AFSR0_EL1: *val = read_sysreg_s(SYS_AFSR0_EL12); break; + case AFSR1_EL1: *val = read_sysreg_s(SYS_AFSR1_EL12); break; + case FAR_EL1: *val = read_sysreg_s(SYS_FAR_EL12); break; + case 
MAIR_EL1: *val = read_sysreg_s(SYS_MAIR_EL12); break; + case VBAR_EL1: *val = read_sysreg_s(SYS_VBAR_EL12); break; + case CONTEXTIDR_EL1: *val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break; + case TPIDR_EL0: *val = read_sysreg_s(SYS_TPIDR_EL0); break; + case TPIDRRO_EL0: *val = read_sysreg_s(SYS_TPIDRRO_EL0); break; + case TPIDR_EL1: *val = read_sysreg_s(SYS_TPIDR_EL1); break; + case AMAIR_EL1: *val = read_sysreg_s(SYS_AMAIR_EL12); break; + case CNTKCTL_EL1: *val = read_sysreg_s(SYS_CNTKCTL_EL12); break; + case PAR_EL1: *val = read_sysreg_s(SYS_PAR_EL1); break; + case DACR32_EL2: *val = read_sysreg_s(SYS_DACR32_EL2); break; + case IFSR32_EL2: *val = read_sysreg_s(SYS_IFSR32_EL2); break; + case DBGVCR32_EL2: *val = read_sysreg_s(SYS_DBGVCR32_EL2); break; + default: return false; } -immediate_read: - return __vcpu_sys_reg(vcpu, reg); + return true; } -void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) +static bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) { - if (!vcpu->arch.sysregs_loaded_on_cpu) - goto immediate_write; - /* * System registers listed in the switch are not restored on every * entry to the guest but are only restored on vcpu_load. @@ -122,32 +116,52 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) * once, before running the VCPU, and never changed later. 
*/ switch (reg) { - case CSSELR_EL1: write_sysreg_s(val, SYS_CSSELR_EL1); return; - case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); return; - case ACTLR_EL1: write_sysreg_s(val, SYS_ACTLR_EL1); return; - case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); return; - case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); return; - case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); return; - case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); return; - case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); return; - case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); return; - case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); return; - case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); return; - case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); return; - case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); return; - case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12); return; - case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); return; - case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); return; - case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); return; - case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); return; - case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); return; - case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); return; - case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); return; - case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); return; - case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); return; + case CSSELR_EL1: write_sysreg_s(val, SYS_CSSELR_EL1); break; + case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break; + case ACTLR_EL1: write_sysreg_s(val, SYS_ACTLR_EL1); break; + case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break; + case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break; + case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break; + case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break; + case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break; + case AFSR0_EL1: write_sysreg_s(val, 
SYS_AFSR0_EL12); break; + case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break; + case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break; + case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break; + case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break; + case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break; + case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break; + case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); break; + case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break; + case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break; + case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break; + case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break; + case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break; + case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break; + case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break; + default: return false; } -immediate_write: + return true; +} + +u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) +{ + u64 val = 0x8badf00d8badf00d; + + if (vcpu->arch.sysregs_loaded_on_cpu && + __vcpu_read_sys_reg_from_cpu(reg, &val)) + return val; + + return __vcpu_sys_reg(vcpu, reg); +} + +void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) +{ + if (vcpu->arch.sysregs_loaded_on_cpu && + __vcpu_write_sys_reg_to_cpu(val, reg)) + return; + __vcpu_sys_reg(vcpu, reg) = val; } -- cgit v1.2.3 From 7ccadf23b8613c946f67e2b3c5e7f436858021aa Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 27 Jan 2020 11:54:42 +0000 Subject: KVM: arm64: Add missing reset handlers for PMU emulation As we're about to become a bit more harsh when it comes to the lack of reset callbacks, let's add the missing PMU reset handlers. Note that these only cover *CLR registers that were always covered by their *SET counterpart, so there is no semantic change here. 
Reviewed-by: James Morse Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 50e328ca1419..9d28eabbdf97 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1546,7 +1546,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 }, { SYS_DESC(SYS_PMINTENSET_EL1), access_pminten, reset_unknown, PMINTENSET_EL1 }, - { SYS_DESC(SYS_PMINTENCLR_EL1), access_pminten, NULL, PMINTENSET_EL1 }, + { SYS_DESC(SYS_PMINTENCLR_EL1), access_pminten, reset_unknown, PMINTENSET_EL1 }, { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, @@ -1585,8 +1585,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_PMCR_EL0), access_pmcr, reset_pmcr, PMCR_EL0 }, { SYS_DESC(SYS_PMCNTENSET_EL0), access_pmcnten, reset_unknown, PMCNTENSET_EL0 }, - { SYS_DESC(SYS_PMCNTENCLR_EL0), access_pmcnten, NULL, PMCNTENSET_EL0 }, - { SYS_DESC(SYS_PMOVSCLR_EL0), access_pmovs, NULL, PMOVSSET_EL0 }, + { SYS_DESC(SYS_PMCNTENCLR_EL0), access_pmcnten, reset_unknown, PMCNTENSET_EL0 }, + { SYS_DESC(SYS_PMOVSCLR_EL0), access_pmovs, reset_unknown, PMOVSSET_EL0 }, { SYS_DESC(SYS_PMSWINC_EL0), access_pmswinc, reset_unknown, PMSWINC_EL0 }, { SYS_DESC(SYS_PMSELR_EL0), access_pmselr, reset_unknown, PMSELR_EL0 }, { SYS_DESC(SYS_PMCEID0_EL0), access_pmceid }, -- cgit v1.2.3 From bb44a8dbea259bc1dc2177b4bc90ca4e8fcbf659 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 27 Jan 2020 11:21:17 +0000 Subject: KVM: arm64: Move sysreg reset check to boot time Our sysreg reset check has become a bit silly, as it only checks whether a reset callback actually exists for a given sysreg entry, and applies the method if available. Doing the check at each vcpu reset is pretty dumb, as the tables never change.
It is thus perfectly possible to do the same checks at boot time. This also allows us to introduce a sparse sys_regs[] array, something that will be required with ARMv8.4-NV. Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 72 +++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 9d28eabbdf97..ad1d57501d6d 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2087,12 +2087,37 @@ static const struct sys_reg_desc cp15_64_regs[] = { { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer }, }; +static int check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, + bool is_32) +{ + unsigned int i; + + for (i = 0; i < n; i++) { + if (!is_32 && table[i].reg && !table[i].reset) { + kvm_err("sys_reg table %p entry %d has lacks reset\n", + table, i); + return 1; + } + + if (i && cmp_sys_reg(&table[i-1], &table[i]) >= 0) { + kvm_err("sys_reg table %p out of order (%d)\n", table, i - 1); + return 1; + } + } + + return 0; +} + /* Target specific emulation tables */ static struct kvm_sys_reg_target_table *target_tables[KVM_ARM_NUM_TARGETS]; void kvm_register_target_sys_reg_table(unsigned int target, struct kvm_sys_reg_target_table *table) { + if (check_sysreg_table(table->table64.table, table->table64.num, false) || + check_sysreg_table(table->table32.table, table->table32.num, true)) + return; + target_tables[target] = table; } @@ -2378,19 +2403,13 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu, } static void reset_sys_reg_descs(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *table, size_t num, - unsigned long *bmap) + const struct sys_reg_desc *table, size_t num) { unsigned long i; for (i = 0; i < num; i++) - if (table[i].reset) { - int reg = table[i].reg; - + if (table[i].reset) table[i].reset(vcpu, &table[i]); - if (reg > 0 && reg < NR_SYS_REGS) - set_bit(reg, bmap); - } } /** @@ -2846,32 +2865,18 @@ int 
kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) return write_demux_regids(uindices); } -static int check_sysreg_table(const struct sys_reg_desc *table, unsigned int n) -{ - unsigned int i; - - for (i = 1; i < n; i++) { - if (cmp_sys_reg(&table[i-1], &table[i]) >= 0) { - kvm_err("sys_reg table %p out of order (%d)\n", table, i - 1); - return 1; - } - } - - return 0; -} - void kvm_sys_reg_table_init(void) { unsigned int i; struct sys_reg_desc clidr; /* Make sure tables are unique and in order. */ - BUG_ON(check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs))); - BUG_ON(check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs))); - BUG_ON(check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs))); - BUG_ON(check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs))); - BUG_ON(check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs))); - BUG_ON(check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs))); + BUG_ON(check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false)); + BUG_ON(check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), true)); + BUG_ON(check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true)); + BUG_ON(check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true)); + BUG_ON(check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true)); + BUG_ON(check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false)); /* We abuse the reset function to overwrite the table itself. */ for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) @@ -2907,17 +2912,10 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu) { size_t num; const struct sys_reg_desc *table; - DECLARE_BITMAP(bmap, NR_SYS_REGS) = { 0, }; /* Generic chip reset first (so target could override). 
*/ reset_sys_reg_descs(vcpu, sys_reg_descs, ARRAY_SIZE(sys_reg_descs), bmap); + reset_sys_reg_descs(vcpu, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); table = get_target_table(vcpu->arch.target, true, &num); - reset_sys_reg_descs(vcpu, table, num, bmap); - - for (num = 1; num < NR_SYS_REGS; num++) { - if (WARN(!test_bit(num, bmap), - "Didn't reset __vcpu_sys_reg(%zi)\n", num)) - break; - } + reset_sys_reg_descs(vcpu, table, num); } -- cgit v1.2.3 From 349c330ced9764667678f4d2804fd4ebc16110c9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 12 Apr 2020 18:49:31 +0100 Subject: KVM: arm64: Don't use empty structures as CPU reset state Keeping an empty structure as the vcpu state initializer is slightly wasteful: we only want to set pstate, and zero everything else. Just do that. Signed-off-by: Marc Zyngier --- arch/arm64/kvm/reset.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 658f3a79617b..865c8aa670bc 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -36,15 +36,11 @@ static u32 kvm_ipa_limit; /* * ARMv8 Reset Values */ -static const struct kvm_regs default_regs_reset = { - .regs.pstate = (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | - PSR_F_BIT | PSR_D_BIT), -}; +#define VCPU_RESET_PSTATE_EL1 (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | \ + PSR_F_BIT | PSR_D_BIT) -static const struct kvm_regs default_regs_reset32 = { - .regs.pstate = (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | - PSR_AA32_I_BIT | PSR_AA32_F_BIT), -}; +#define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \ + PSR_AA32_I_BIT | PSR_AA32_F_BIT) static bool cpu_has_32bit_el1(void) { @@ -257,9 +253,9 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) */ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) { - const struct kvm_regs *cpu_reset; int ret = -EINVAL; bool loaded; + u32 pstate; /* Reset PMU outside of the non-preemptible section */ kvm_pmu_vcpu_reset(vcpu); @@ -290,16 +286,17 @@
int kvm_reset_vcpu(struct kvm_vcpu *vcpu) if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) { if (!cpu_has_32bit_el1()) goto out; - cpu_reset = &default_regs_reset32; + pstate = VCPU_RESET_PSTATE_SVC; } else { - cpu_reset = &default_regs_reset; + pstate = VCPU_RESET_PSTATE_EL1; } break; } /* Reset core registers */ - memcpy(vcpu_gp_regs(vcpu), cpu_reset, sizeof(*cpu_reset)); + memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu))); + vcpu_gp_regs(vcpu)->regs.pstate = pstate; /* Reset system registers */ kvm_reset_sys_regs(vcpu); -- cgit v1.2.3 From d9d7d84d9906e1bc886c5e0fc66aaad26008264b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 21 Apr 2020 18:32:02 +0100 Subject: KVM: arm64: Parametrize exception entry with a target EL We currently assume that an exception is delivered to EL1, always. Once we emulate EL2, this no longer will be the case. To prepare for this, add a target_mode parameter. While we're at it, merge the computing of the target PC and PSTATE in a single function that updates both PC and CPSR after saving their previous values in the corresponding ELR/SPSR. This ensures that they are updated in the correct order (a pretty common source of bugs...). 
Reviewed-by: Mark Rutland Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/ptrace.h | 1 + arch/arm64/kvm/inject_fault.c | 75 +++++++++++++++++++++-------------------- 2 files changed, 39 insertions(+), 37 deletions(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index bf57308fcd63..953b6a1ce549 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -35,6 +35,7 @@ #define GIC_PRIO_PSR_I_SET (1 << 4) /* Additional SPSR bits not exposed in the UABI */ +#define PSR_MODE_THREAD_BIT (1 << 0) #define PSR_IL_BIT (1 << 20) /* AArch32-specific ptrace requests */ diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 6aafc2825c1c..e21fdd93027a 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -26,28 +26,12 @@ enum exception_type { except_type_serror = 0x180, }; -static u64 get_except_vector(struct kvm_vcpu *vcpu, enum exception_type type) -{ - u64 exc_offset; - - switch (*vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT)) { - case PSR_MODE_EL1t: - exc_offset = CURRENT_EL_SP_EL0_VECTOR; - break; - case PSR_MODE_EL1h: - exc_offset = CURRENT_EL_SP_ELx_VECTOR; - break; - case PSR_MODE_EL0t: - exc_offset = LOWER_EL_AArch64_VECTOR; - break; - default: - exc_offset = LOWER_EL_AArch32_VECTOR; - } - - return vcpu_read_sys_reg(vcpu, VBAR_EL1) + exc_offset + type; -} - /* + * This performs the exception entry at a given EL (@target_mode), stashing PC + * and PSTATE into ELR and SPSR respectively, and compute the new PC/PSTATE. + * The EL passed to this function *must* be a non-secure, privileged mode with + * bit 0 being set (PSTATE.SP == 1). + * * When an exception is taken, most PSTATE fields are left unchanged in the * handler. However, some are explicitly overridden (e.g. M[4:0]). 
Luckily all * of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx @@ -59,10 +43,35 @@ static u64 get_except_vector(struct kvm_vcpu *vcpu, enum exception_type type) * Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from * MSB to LSB. */ -static unsigned long get_except64_pstate(struct kvm_vcpu *vcpu) +static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode, + enum exception_type type) { - unsigned long sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); - unsigned long old, new; + unsigned long sctlr, vbar, old, new, mode; + u64 exc_offset; + + mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); + + if (mode == target_mode) + exc_offset = CURRENT_EL_SP_ELx_VECTOR; + else if ((mode | PSR_MODE_THREAD_BIT) == target_mode) + exc_offset = CURRENT_EL_SP_EL0_VECTOR; + else if (!(mode & PSR_MODE32_BIT)) + exc_offset = LOWER_EL_AArch64_VECTOR; + else + exc_offset = LOWER_EL_AArch32_VECTOR; + + switch (target_mode) { + case PSR_MODE_EL1h: + vbar = vcpu_read_sys_reg(vcpu, VBAR_EL1); + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu)); + break; + default: + /* Don't do that */ + BUG(); + } + + *vcpu_pc(vcpu) = vbar + exc_offset + type; old = *vcpu_cpsr(vcpu); new = 0; @@ -105,9 +114,10 @@ static unsigned long get_except64_pstate(struct kvm_vcpu *vcpu) new |= PSR_I_BIT; new |= PSR_F_BIT; - new |= PSR_MODE_EL1h; + new |= target_mode; - return new; + *vcpu_cpsr(vcpu) = new; + vcpu_write_spsr(vcpu, old); } static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr) @@ -116,11 +126,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr bool is_aarch32 = vcpu_mode_is_32bit(vcpu); u32 esr = 0; - vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu)); - *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync); - - *vcpu_cpsr(vcpu) = get_except64_pstate(vcpu); - vcpu_write_spsr(vcpu, cpsr); + enter_exception64(vcpu, PSR_MODE_EL1h, 
except_type_sync); vcpu_write_sys_reg(vcpu, addr, FAR_EL1); @@ -148,14 +154,9 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr static void inject_undef64(struct kvm_vcpu *vcpu) { - unsigned long cpsr = *vcpu_cpsr(vcpu); u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); - vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu)); - *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync); - - *vcpu_cpsr(vcpu) = get_except64_pstate(vcpu); - vcpu_write_spsr(vcpu, cpsr); + enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync); /* * Build an unknown exception, depending on the instruction -- cgit v1.2.3 From 8f7f4fe756bd5cfef73cf8234445081385bdbf7d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 27 May 2020 11:38:57 +0100 Subject: KVM: arm64: Drop obsolete comment about sys_reg ordering The general comment about keeping the enum order in sync with the save/restore code has been obsolete for many years now. Just drop it. Note that there are other ordering requirements in the enum, such as the PtrAuth and PMU registers, which are still valid. Reported-by: James Morse Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 69a338a390a6..59029e90b557 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -115,12 +115,8 @@ struct kvm_vcpu_fault_info { u64 disr_el1; /* Deferred [SError] Status Register */ }; -/* - * 0 is reserved as an invalid value. - * Order should be kept in sync with the save/restore code. - */ enum vcpu_sysreg { - __INVALID_SYSREG__, + __INVALID_SYSREG__, /* 0 is reserved as an invalid value */ MPIDR_EL1, /* MultiProcessor Affinity Register */ CSSELR_EL1, /* Cache Size Selection Register */ SCTLR_EL1, /* System Control Register */ -- cgit v1.2.3